guile-uniseg/uniseg/internal.scm

222 lines
6.1 KiB
Scheme

(define-module (uniseg internal)
#:use-module (ice-9 peg)
#:use-module (ice-9 match)
#:use-module (ice-9 textual-ports)
#:use-module (ice-9 exceptions)
#:use-module (ice-9 hash-table)
#:use-module (ice-9 i18n)
#:use-module (ice-9 format)
#:use-module (ice-9 pretty-print)
#:use-module (web uri)
#:use-module (web client)
#:use-module (web request)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-71)
#:export (@line
cpk-values
cpk
cons-hash-list!
symbol-with-prefix
hex-string->integer
format-exception-msg
in-surrogate-range
cmdline-wget-or-file
ranges->charset!
make-line-processor))
;;
;; Common PEG patterns
;;
(define-peg-pattern @hex body (peg "[a-fA-F0-9]"))
(define-peg-pattern @codepoint all
(* @hex))
(define-peg-pattern @codepoint-range all
(or
(and @codepoint (ignore "..") @codepoint)
@codepoint))
(define-peg-pattern @comment all
(and (ignore "#") (* peg-any)))
(define-peg-pattern @ws none
(or " " "\t"))
(define-peg-pattern @property all
(* (peg "[a-zA-Z_]")))
(define-peg-pattern @datum body
(and @codepoint-range (* @ws) (ignore ";") (* @ws) @property))
(define-peg-pattern @line body
(and @datum (* @ws) @comment))
(define (symbol-with-prefix prefix symbol)
(string->symbol
(string-append prefix (symbol->string symbol))))
;; Giant unicode code generation procedure creator
(define (make-line-processor
hashtable
string->property
properties
symbols
charsets-symbol
stdout)
(define properties-and-symbols
(zip properties symbols))
(define (process-line line)
(define tree (peg:tree (match-pattern @line line)))
(unless
(or (not tree)
(null? tree)
(eq? 'comment (car tree)))
(match tree
(((('@codepoint-range
('@codepoint codepoints) ___)
('@property prop-str))
('@comment comment))
(with-exception-handler
(λ (err)
(format stdout "Skipping line due to error :: ")
(format-exception-msg stdout err))
(λ ()
(let ((f (hex-string->integer (first codepoints)))
(l (hex-string->integer (last codepoints)))
(property (string->property prop-str comment)))
(when (or (in-surrogate-range f)
(in-surrogate-range l))
(error (format #f "chars in surrogate range ~x -> ~x" f l)))
(cons-hash-list! hashtable property f l)))
#:unwind? #t)))))
(define (print-to-file)
(pretty-print
`(define hashtable
(alist->hashq-table ',(hash-map->list cons hashtable))))
(display "\n")
(for-each
(λ (sym)
(pretty-print
`(define ,sym (char-set))))
symbols)
(display "\n")
(pretty-print
`(define ,charsets-symbol
(list
,@(map
(λ (pair)
(let ((f (first pair))
(s (second pair)))
`(list ',f ,s)))
properties-and-symbols))))
(display "\n")
(for-each
(λ (set-pair)
(let ((name (first set-pair))
(symbol (second set-pair)))
(pretty-print
`(ranges->charset! hashtable ',name ,symbol))))
properties-and-symbols)
(display "\n"))
(values process-line print-to-file))
;; Helper macro to add a list of character ranges
;; to a hash-set.
(define (cons-hash-list! ht key low high)
(let* ((old (hashq-ref ht key))
(value (list low high))
(new-lst
(if old
(cons value old)
(list value))))
(hashq-set! ht key new-lst)))
;; Helper macro to take above character range hash table
;; and insert it into Guile's chasracter set system.
(define-syntax-rule (ranges->charset! ht name symbol)
(let* ((pairs (hashq-ref ht name)))
(for-each
(λ (pair)
(ucs-range->char-set!
(first pair)
;; Exclusive upper range, so add one
(+ (second pair) 1)
#t symbol))
pairs)))
(define (hex-string->integer str)
;; XXX: We would ideally do integer->char here and save it to file as such
;; However read-expr* does not actually work for all the characters!
;; So they can't be written out as such.
(locale-string->integer str 16))
;; This should be in the standard library...
(define (format-exception-msg port err)
(apply format port (exception-message err) (exception-irritants err))
(display "\n" port))
;; Little checker to see if we are in the danger zone
;; (surrogate range is invalid unicode and guile errors)
(define (in-surrogate-range num)
(and (>= num #xd800)
(<= num #xdfff)))
(define (cmdline-wget-or-file url log-port)
(let ((cmdline (command-line)))
(if (= 2 (length cmdline))
(file-to-lines (last cmdline) log-port)
(wget-to-lines url log-port))))
(define (wget-to-lines url log-port)
(format log-port "Downloading from ~a..." url)
(let* ((response body (http-get (string->uri url)))
(lines (string-split body #\newline)))
(format log-port " done.\n")
lines))
(define (file-to-lines path log-port)
(format log-port "Loading from local file ~a\n" path)
(string-split
(with-input-from-file path
(λ () (get-string-all (current-input-port)))) #\newline))
;; kludge: emacs mis-renders this as a string, so...
(define hash-semicolon-semicolon
(list->string '(#\# #\; #\;)))
;; for debugging
(define (cpk . vals)
"Peek at values for print debugging, but return 'em"
(display hash-semicolon-semicolon (current-error-port))
(display " cpk\n" (current-error-port))
(pretty-print vals (current-error-port))
;; return the last value
(last vals))
(define-syntax-rule (cpk-values print-these ... body)
;; Like pk, but supporting multiple value return
(call-with-values
(lambda () body)
(lambda vals
(display hash-semicolon-semicolon (current-error-port))
(display " cpk-values\n" (current-error-port))
(pretty-print (list print-these ... '*values:* vals)
(current-error-port))
(apply values vals))))