Vivianne Langdon
dc90b8fd6c
- Have to break the match statement into specific section and non-specific section - (some ordering is still unclear to me, lets see if we can copy more go-uniseg unit tests) - Add emoji to the grapheme list - First character must be processed first - then state transition done on next char with peek! - Fix other off-by-one issues, now unit tests pass!
169 lines
4.3 KiB
Scheme
169 lines
4.3 KiB
Scheme
#!@GUILE@ --no-auto-compile
|
|
-*- scheme -*-
|
|
!#
|
|
|
|
;; Can be called with a trailing argument pointing to the file on disk.
|
|
|
|
(use-modules
|
|
(uniseg internal)
|
|
(ice-9 pretty-print)
|
|
(ice-9 peg)
|
|
(ice-9 format)
|
|
(ice-9 exceptions)
|
|
(ice-9 match)
|
|
(ice-9 hash-table)
|
|
(srfi srfi-1))
|
|
|
|
(define stdout (current-output-port))
|
|
|
|
(define grapheme-url
|
|
"https://www.unicode.org/Public/15.0.0/ucd/auxiliary/GraphemeBreakProperty.txt")
|
|
|
|
(define-peg-pattern @grapheme-category all
|
|
(* (peg "[a-zA-Z_]")))
|
|
|
|
(define-peg-pattern @grapheme-datum body
|
|
(and @codepoint-range (* @ws) (ignore ";") (* @ws) @grapheme-category))
|
|
|
|
(define-peg-pattern @grapheme-line body
|
|
(and @grapheme-datum (* @ws) @comment))
|
|
|
|
(define grapheme-ht (make-hash-table 13))
|
|
|
|
(define grapheme-sets
|
|
'(hangul-syllable-l
|
|
hangul-syllable-v
|
|
hangul-syllable-lv
|
|
hangul-syllable-lvt
|
|
prepend
|
|
carriage-return
|
|
line-feed
|
|
control
|
|
extend
|
|
regional-indicator
|
|
spacing-mark
|
|
zero-width-joiner))
|
|
|
|
(define grapheme-symbol-names
|
|
(map
|
|
(λ (set)
|
|
(string->symbol
|
|
(string-concatenate
|
|
(list "char-set:grapheme-"
|
|
(symbol->string set)))))
|
|
grapheme-sets))
|
|
|
|
(define grapheme-sets-and-symbols
|
|
(zip grapheme-sets grapheme-symbol-names))
|
|
|
|
(define (process-grapheme-line line)
|
|
(define (string->category str)
|
|
(match str
|
|
("L" 'hangul-syllable-l)
|
|
("V" 'hangul-syllable-v)
|
|
("T" 'hangul-syllable-t)
|
|
("LV" 'hangul-syllable-lv)
|
|
("LVT" 'hangul-syllable-lvt)
|
|
("Prepend" 'prepend)
|
|
("CR" 'carriage-return)
|
|
("LF" 'line-feed)
|
|
("Control" 'control)
|
|
("Extend" 'extend)
|
|
("Regional_Indicator" 'regional-indicator)
|
|
("SpacingMark" 'spacing-mark)
|
|
("ZWJ" 'zero-width-joiner)))
|
|
|
|
(define tree (peg:tree (match-pattern @grapheme-line line)))
|
|
|
|
(unless (or (not tree)
|
|
(null? tree)
|
|
(eq? '@comment (car tree)))
|
|
|
|
(match tree
|
|
(((('@codepoint-range
|
|
('@codepoint codepoints) ...)
|
|
('@grapheme-category cat-str))
|
|
('@comment comment))
|
|
|
|
(with-exception-handler
|
|
(λ (err)
|
|
(format stdout "Skipping line due to error :: ")
|
|
(format-exception-msg stdout err))
|
|
(λ ()
|
|
(let ((f (hex-string->integer (first codepoints)))
|
|
(l (hex-string->integer (last codepoints)))
|
|
(category (string->category cat-str)))
|
|
|
|
(when (or (in-surrogate-range f)
|
|
(in-surrogate-range l))
|
|
(error (format #f "chars in surrogate range ~x -> ~x" f l)))
|
|
|
|
(cons-hash-list! grapheme-ht category f l)))
|
|
#:unwind? #t)))))
|
|
|
|
(define line-func
|
|
(if (= 2 (length (command-line)))
|
|
(λ ()
|
|
(file-to-lines (last (command-line)) stdout))
|
|
(λ ()
|
|
(wget-to-lines grapheme-url stdout))))
|
|
|
|
(define file "uniseg/graphemes.scm")
|
|
|
|
(format stdout "Writing to ~a...\n" file)
|
|
|
|
(with-output-to-file file
|
|
(λ ()
|
|
(format #t ";; Code generated by ~a. DO NOT EDIT\n\n" (first (command-line)))
|
|
|
|
(for-each process-grapheme-line (line-func))
|
|
|
|
(pretty-print
|
|
`(define-module (uniseg graphemes)
|
|
#:use-module (ice-9 hash-table)
|
|
#:use-module (srfi srfi-1)
|
|
#:use-module (uniseg internal)
|
|
#:use-module (uniseg emoji)
|
|
#:export (,@grapheme-symbol-names
|
|
grapheme-charsets)))
|
|
|
|
(pretty-print
|
|
`(define grapheme-ht
|
|
(alist->hashq-table ',(hash-map->list cons grapheme-ht))))
|
|
|
|
(display "\n")
|
|
|
|
(for-each
|
|
(λ (sym)
|
|
(pretty-print
|
|
`(define ,sym (char-set))))
|
|
grapheme-symbol-names)
|
|
|
|
(display "\n")
|
|
|
|
(pretty-print
|
|
`(define grapheme-charsets
|
|
(list
|
|
,@(map
|
|
(λ (pair)
|
|
(let ((f (first pair))
|
|
(s (second pair)))
|
|
`(list ',f ,s)))
|
|
grapheme-sets-and-symbols)
|
|
;; Need emoji in this set too!
|
|
(list 'extended-pictographic char-set:emoji-extended-pictographic))))
|
|
|
|
|
|
(display "\n")
|
|
|
|
(for-each
|
|
(λ (set-pair)
|
|
(let ((name (first set-pair))
|
|
(symbol (second set-pair)))
|
|
(pretty-print
|
|
`(ranges->charset! grapheme-ht ',name ,symbol))))
|
|
grapheme-sets-and-symbols)
|
|
|
|
(display "Code generation complete.\n" stdout)))
|
|
|
|
(format stdout "Written to ~a.\n" file)
|