2024-03-03 18:13:22 +00:00
|
|
|
(define-module (uniseg internal)
|
2024-03-02 18:37:54 +00:00
|
|
|
#:use-module (ice-9 peg)
|
|
|
|
#:use-module (ice-9 textual-ports)
|
|
|
|
#:use-module (ice-9 exceptions)
|
|
|
|
#:use-module (ice-9 i18n)
|
2024-03-04 19:04:27 +00:00
|
|
|
#:use-module (ice-9 pretty-print)
|
2024-03-02 18:37:54 +00:00
|
|
|
#:use-module (web uri)
|
|
|
|
#:use-module (web client)
|
|
|
|
#:use-module (web request)
|
2024-03-02 23:54:38 +00:00
|
|
|
#:use-module (srfi srfi-1)
|
2024-03-02 18:37:54 +00:00
|
|
|
#:use-module (srfi srfi-71)
|
|
|
|
#:export (@hex
|
|
|
|
@codepoint
|
|
|
|
@codepoint-range
|
|
|
|
@comment
|
|
|
|
@ws
|
2024-03-04 19:04:27 +00:00
|
|
|
cpk-values
|
|
|
|
cpk
|
2024-03-02 23:54:38 +00:00
|
|
|
cons-hash-list!
|
2024-03-02 18:37:54 +00:00
|
|
|
hex-string->integer
|
|
|
|
format-exception-msg
|
|
|
|
in-surrogate-range
|
2024-03-04 19:27:12 +00:00
|
|
|
cmdline-wget-or-file
|
2024-03-02 23:54:38 +00:00
|
|
|
ranges->charset!))
|
2024-03-02 18:37:54 +00:00
|
|
|
|
2024-03-04 01:25:47 +00:00
|
|
|
;;
|
|
|
|
;; Common PEG patterns
|
|
|
|
;;
|
2024-03-02 18:37:54 +00:00
|
|
|
(define-peg-pattern @hex body (peg "[a-fA-F0-9]"))
|
|
|
|
|
|
|
|
(define-peg-pattern @codepoint all
|
|
|
|
(* @hex))
|
|
|
|
|
|
|
|
(define-peg-pattern @codepoint-range all
|
|
|
|
(or
|
|
|
|
(and @codepoint (ignore "..") @codepoint)
|
|
|
|
@codepoint))
|
|
|
|
|
|
|
|
(define-peg-pattern @comment all
|
|
|
|
(and (ignore "#") (* peg-any)))
|
|
|
|
|
|
|
|
(define-peg-pattern @ws none
|
|
|
|
(or " " "\t"))
|
|
|
|
|
2024-03-04 01:25:47 +00:00
|
|
|
;; Helper macro to add a list of character ranges
|
|
|
|
;; to a hash-set.
|
2024-03-02 23:54:38 +00:00
|
|
|
(define-syntax-rule (cons-hash-list! ht key low high)
|
|
|
|
(let* ((old (hashq-ref ht key))
|
|
|
|
(value (list low high))
|
|
|
|
(new-lst
|
|
|
|
(if old
|
|
|
|
(cons value old)
|
|
|
|
(list value))))
|
|
|
|
(hashq-set! ht key new-lst)))
|
|
|
|
|
2024-03-04 01:25:47 +00:00
|
|
|
;; Helper macro to take above character range hash table
|
|
|
|
;; and insert it into Guile's chasracter set system.
|
2024-03-02 23:54:38 +00:00
|
|
|
(define-syntax-rule (ranges->charset! ht name symbol)
|
|
|
|
(let* ((pairs (hashq-ref ht name)))
|
|
|
|
(for-each
|
|
|
|
(λ (pair)
|
|
|
|
(ucs-range->char-set!
|
|
|
|
(first pair)
|
|
|
|
;; Exclusive upper range, so add one
|
|
|
|
(+ (second pair) 1)
|
|
|
|
#t symbol))
|
|
|
|
pairs)))
|
|
|
|
|
2024-03-02 18:37:54 +00:00
|
|
|
(define (hex-string->integer str)
|
|
|
|
;; XXX: We would ideally do integer->char here and save it to file as such
|
|
|
|
;; However read-expr* does not actually work for all the characters!
|
|
|
|
;; So they can't be written out as such.
|
|
|
|
(locale-string->integer str 16))
|
|
|
|
|
2024-03-04 01:25:47 +00:00
|
|
|
;; This should be in the standard library...
|
2024-03-02 18:37:54 +00:00
|
|
|
(define (format-exception-msg port err)
|
|
|
|
(apply format port (exception-message err) (exception-irritants err))
|
|
|
|
(display "\n" port))
|
|
|
|
|
2024-03-04 01:25:47 +00:00
|
|
|
;; Little checker to see if we are in the danger zone
|
|
|
|
;; (surrogate range is invalid unicode and guile errors)
|
2024-03-02 18:37:54 +00:00
|
|
|
(define (in-surrogate-range num)
|
|
|
|
(and (>= num #xd800)
|
|
|
|
(<= num #xdfff)))
|
|
|
|
|
2024-03-04 19:27:12 +00:00
|
|
|
(define (cmdline-wget-or-file url log-port)
|
|
|
|
(let ((cmdline (command-line)))
|
|
|
|
(if (= 2 (length cmdline))
|
|
|
|
(file-to-lines (last cmdline) log-port)
|
|
|
|
(wget-to-lines url log-port))))
|
|
|
|
|
|
|
|
(define (wget-to-lines url log-port)
|
|
|
|
(format log-port "Downloading from ~a..." url)
|
2024-03-02 18:37:54 +00:00
|
|
|
|
|
|
|
(let* ((response body (http-get (string->uri url)))
|
|
|
|
(lines (string-split body #\newline)))
|
2024-03-04 19:27:12 +00:00
|
|
|
(format log-port " done.\n")
|
2024-03-02 18:37:54 +00:00
|
|
|
lines))
|
|
|
|
|
2024-03-04 19:27:12 +00:00
|
|
|
(define (file-to-lines path log-port)
|
|
|
|
(format log-port "Loading from local file ~a\n" path)
|
2024-03-02 18:37:54 +00:00
|
|
|
(string-split
|
|
|
|
(with-input-from-file path
|
|
|
|
(λ () (get-string-all (current-input-port)))) #\newline))
|
|
|
|
|
2024-03-04 19:04:27 +00:00
|
|
|
;; kludge: emacs mis-renders this as a string, so...
|
|
|
|
(define hash-semicolon-semicolon
|
|
|
|
(list->string '(#\# #\; #\;)))
|
2024-03-02 18:37:54 +00:00
|
|
|
|
2024-03-04 19:04:27 +00:00
|
|
|
;; for debugging
|
|
|
|
(define (cpk . vals)
|
|
|
|
"Peek at values for print debugging, but return 'em"
|
|
|
|
(display hash-semicolon-semicolon (current-error-port))
|
|
|
|
(display " cpk\n" (current-error-port))
|
|
|
|
(pretty-print vals (current-error-port))
|
|
|
|
;; return the last value
|
|
|
|
(last vals))
|
|
|
|
|
|
|
|
(define-syntax-rule (cpk-values print-these ... body)
|
|
|
|
;; Like pk, but supporting multiple value return
|
|
|
|
(call-with-values
|
|
|
|
(lambda () body)
|
|
|
|
(lambda vals
|
|
|
|
(display hash-semicolon-semicolon (current-error-port))
|
|
|
|
(display " cpk-values\n" (current-error-port))
|
|
|
|
(pretty-print (list print-these ... '*values:* vals)
|
|
|
|
(current-error-port))
|
|
|
|
(apply values vals))))
|