(define-module (uniseg internal) #:use-module (ice-9 peg) #:use-module (ice-9 match) #:use-module (ice-9 textual-ports) #:use-module (ice-9 exceptions) #:use-module (ice-9 hash-table) #:use-module (ice-9 i18n) #:use-module (ice-9 format) #:use-module (ice-9 pretty-print) #:use-module (web uri) #:use-module (web client) #:use-module (web request) #:use-module (srfi srfi-1) #:use-module (srfi srfi-71) #:export (@line cpk-values cpk cons-hash-list! symbol-with-prefix hex-string->integer format-exception-msg in-surrogate-range cmdline-wget-or-file ranges->charset! make-line-processor)) ;; ;; Common PEG patterns ;; (define-peg-pattern @hex body (peg "[a-fA-F0-9]")) (define-peg-pattern @codepoint all (* @hex)) (define-peg-pattern @codepoint-range all (or (and @codepoint (ignore "..") @codepoint) @codepoint)) (define-peg-pattern @comment all (and (ignore "#") (* peg-any))) (define-peg-pattern @ws none (or " " "\t")) (define-peg-pattern @property all (* (peg "[a-zA-Z_]"))) (define-peg-pattern @datum body (and @codepoint-range (* @ws) (ignore ";") (* @ws) @property)) (define-peg-pattern @line body (and @datum (* @ws) @comment)) (define (symbol-with-prefix prefix symbol) (string->symbol (string-append prefix (symbol->string symbol)))) ;; Giant unicode code generation procedure creator (define (make-line-processor hashtable string->property properties symbols charsets-symbol stdout) (define properties-and-symbols (zip properties symbols)) (define (process-line line) (define tree (peg:tree (match-pattern @line line))) (unless (or (not tree) (null? tree) (eq? 'comment (car tree))) (match tree (((('@codepoint-range ('@codepoint codepoints) ___) ('@property prop-str)) ('@comment comment)) (with-exception-handler (λ (err) (format stdout "Skipping line due to error :: ") (format-exception-msg stdout err)) (λ () (let ((f (hex-string->integer (first codepoints))) (l (hex-string->integer (last codepoints))) (property (string->property prop-str comment))) (when (or (in-surrogate-range f) (in-surrogate-range l)) (error (format #f "chars in surrogate range ~x -> ~x" f l))) (cons-hash-list! hashtable property f l))) #:unwind? #t))))) (define (print-to-file) (pretty-print `(define hashtable (alist->hashq-table ',(hash-map->list cons hashtable)))) (display "\n") (for-each (λ (sym) (pretty-print `(define ,sym (char-set)))) symbols) (display "\n") (pretty-print `(define ,charsets-symbol (list ,@(map (λ (pair) (let ((f (first pair)) (s (second pair))) `(list ',f ,s))) properties-and-symbols)))) (display "\n") (for-each (λ (set-pair) (let ((name (first set-pair)) (symbol (second set-pair))) (pretty-print `(ranges->charset! hashtable ',name ,symbol)))) properties-and-symbols) (display "\n")) (values process-line print-to-file)) ;; Helper macro to add a list of character ranges ;; to a hash-set. (define (cons-hash-list! ht key low high) (let* ((old (hashq-ref ht key)) (value (list low high)) (new-lst (if old (cons value old) (list value)))) (hashq-set! ht key new-lst))) ;; Helper macro to take above character range hash table ;; and insert it into Guile's chasracter set system. (define-syntax-rule (ranges->charset! ht name symbol) (let* ((pairs (hashq-ref ht name))) (for-each (λ (pair) (ucs-range->char-set! (first pair) ;; Exclusive upper range, so add one (+ (second pair) 1) #t symbol)) pairs))) (define (hex-string->integer str) ;; XXX: We would ideally do integer->char here and save it to file as such ;; However read-expr* does not actually work for all the characters! ;; So they can't be written out as such. (locale-string->integer str 16)) ;; This should be in the standard library... (define (format-exception-msg port err) (apply format port (exception-message err) (exception-irritants err)) (display "\n" port)) ;; Little checker to see if we are in the danger zone ;; (surrogate range is invalid unicode and guile errors) (define (in-surrogate-range num) (and (>= num #xd800) (<= num #xdfff))) (define (cmdline-wget-or-file url log-port) (let ((cmdline (command-line))) (if (= 2 (length cmdline)) (file-to-lines (last cmdline) log-port) (wget-to-lines url log-port)))) (define (wget-to-lines url log-port) (format log-port "Downloading from ~a..." url) (let* ((response body (http-get (string->uri url))) (lines (string-split body #\newline))) (format log-port " done.\n") lines)) (define (file-to-lines path log-port) (format log-port "Loading from local file ~a\n" path) (string-split (with-input-from-file path (λ () (get-string-all (current-input-port)))) #\newline)) ;; kludge: emacs mis-renders this as a string, so... (define hash-semicolon-semicolon (list->string '(#\# #\; #\;))) ;; for debugging (define (cpk . vals) "Peek at values for print debugging, but return 'em" (display hash-semicolon-semicolon (current-error-port)) (display " cpk\n" (current-error-port)) (pretty-print vals (current-error-port)) ;; return the last value (last vals)) (define-syntax-rule (cpk-values print-these ... body) ;; Like pk, but supporting multiple value return (call-with-values (lambda () body) (lambda vals (display hash-semicolon-semicolon (current-error-port)) (display " cpk-values\n" (current-error-port)) (pretty-print (list print-these ... '*values:* vals) (current-error-port)) (apply values vals))))