Vivianne Langdon
31012d5b8f
- We change the stream iterator to *always* return a grapheme (except for EOF). The grapheme then gets built up over time. - This way, trans flag for example is first white flag, then white flag + zwj, etc until it finally transforms into the trans flag. - Users of the stream library can then use the `modification?' flag to determine if the stream value is a modification of the prior grapheme instead of a new grapheme. - Abstracted iteration to an iterator object to support use cases where we don't have an input stream (reflow needs this!)
96 lines
2.3 KiB
Scheme
96 lines
2.3 KiB
Scheme
#!@GUILE@ --no-auto-compile
|
|
-*- scheme -*-
|
|
!#
|
|
|
|
;; Can be called with a trailing argument pointing to the file on disk.
|
|
|
|
(use-modules
|
|
(uniseg internal)
|
|
(ice-9 pretty-print)
|
|
(ice-9 peg)
|
|
(ice-9 format)
|
|
(ice-9 exceptions)
|
|
(ice-9 match)
|
|
(ice-9 hash-table)
|
|
(srfi srfi-1))
|
|
|
|
;; Remember the port that is current when the script starts.  The generation
;; step further down runs inside `with-output-to-file', which rebinds the
;; current output port to the generated file; progress messages are written
;; to this saved port instead.
(define stdout (current-output-port))

;; Location of the Unicode 15.0.0 grapheme break property table.
(define url "https://www.unicode.org/Public/15.0.0/ucd/auxiliary/GraphemeBreakProperty.txt")

;; Hash table handed to the line processor.  The argument 13 is only an
;; initial size hint -- presumably one slot per property name that
;; `string->property' can return (TODO confirm against make-line-processor).
(define grapheme-ht (make-hash-table 13))
|
|
|
|
;; Every property symbol for which a charset is generated.
;;
;; This list must contain each symbol that `string->property' can return.
;; It is also the source of `grapheme-symbols', and therefore of the
;; generated module's export list.
;;
;; `hangul-syllable-t' was previously missing even though "T" is mapped by
;; `string->property', so the Hangul trailing-consonant set never got a name
;; in this list.  It is added here, in the same order the properties appear
;; in `string->property'.
(define grapheme-properties
  '(hangul-syllable-l
    hangul-syllable-v
    hangul-syllable-t
    hangul-syllable-lv
    hangul-syllable-lvt
    prepend
    carriage-return
    line-feed
    control
    extend
    regional-indicator
    spacing-mark
    zero-width-joiner))
|
|
|
|
;; One identifier per entry of `grapheme-properties', in the same order,
;; built by passing the prefix "char-set:grapheme-" and the property to
;; `symbol-with-prefix'.  That helper is not defined in this file
;; (presumably it comes from (uniseg internal) -- TODO confirm).  These are
;; the names spliced into the generated module's #:export list.
(define grapheme-symbols
  (let ((prefix "char-set:grapheme-"))
    (map (lambda (property-name)
           (symbol-with-prefix prefix property-name))
         grapheme-properties)))
|
|
|
|
;; Translate a property name as spelled in GraphemeBreakProperty.txt into
;; the symbol used for it throughout this script.
;;
;; STR is the property name (for example "LVT" or "Regional_Indicator").
;; COMMENT is accepted but not consulted here.
;;
;; A STR that matches none of the clauses makes `match' signal an error.
(define (string->property str comment)
  (match str
    ;; Line terminators and other controls.
    ("CR" 'carriage-return)
    ("LF" 'line-feed)
    ("Control" 'control)
    ;; Code points that attach to neighbouring characters.
    ("Prepend" 'prepend)
    ("Extend" 'extend)
    ("SpacingMark" 'spacing-mark)
    ("ZWJ" 'zero-width-joiner)
    ("Regional_Indicator" 'regional-indicator)
    ;; Hangul jamo and precomposed syllables.
    ("L" 'hangul-syllable-l)
    ("V" 'hangul-syllable-v)
    ("T" 'hangul-syllable-t)
    ("LV" 'hangul-syllable-lv)
    ("LVT" 'hangul-syllable-lvt)))
|
|
|
|
;; Path of the module this script generates.
(define file "uniseg/charsets/graphemes.scm")

(format stdout "Writing to ~a...\n" file)

;; Everything printed to the current output port inside the thunk lands in
;; FILE; messages sent explicitly to `stdout' go to the saved port.
(with-output-to-file file
  (lambda ()
    ;; Banner marking the output as generated.
    (format #t ";; Code generated by ~a. DO NOT EDIT\n\n"
            (basename (current-filename)))

    ;; Module header; the export list is every generated charset name plus
    ;; `grapheme-charsets'.
    (pretty-print
     `(define-module (uniseg charsets graphemes)
        #:use-module (ice-9 hash-table)
        #:use-module (srfi srfi-1)
        #:use-module (uniseg internal)
        #:use-module (uniseg charsets emoji)
        #:export (,@grapheme-symbols
                  grapheme-charsets)))

    ;; `make-line-processor' returns two values: a procedure applied to each
    ;; element of the input list, and a thunk called once afterwards.
    ;; NOTE(review): both helpers are defined outside this file; their exact
    ;; contracts should be confirmed there.
    (call-with-values
        (lambda ()
          (make-line-processor grapheme-ht
                               string->property
                               grapheme-properties
                               grapheme-symbols
                               'grapheme-charsets
                               stdout))
      (lambda (process-line print-to-file)
        (for-each process-line (cmdline-wget-or-file url stdout))
        (print-to-file)))

    ;; Need emoji in the set as well.
    (pretty-print
     `(set! grapheme-charsets
            (cons (list 'extended-pictographic char-set:emoji-extended-pictographic)
                  grapheme-charsets)))

    (display "Code generation complete.\n" stdout)))

(format stdout "Written to ~a.\n" file)
|