Adding check for east asian charset with tests

This commit is contained in:
Vivianne 2024-02-29 15:17:43 -05:00
parent a5bdb2e688
commit eaaceb3c56
5 changed files with 128 additions and 2 deletions

View file

@ -21,7 +21,7 @@
(files (libraries
((directory "runewidth" ())
(scheme-file "runewidth")))
(tests ((directory "tests" ())))
(tests ((directory "tests" ((scheme-file "test-posix")))))
(programs ())
(documentation
((org-file "README")

35
runewidth/hconfig.scm Normal file
View file

@ -0,0 +1,35 @@
(define-module
(runewidth hconfig)
#:use-module
(srfi srfi-26)
#:export
(%version
%author
%license
%copyright
%gettext-domain
G_
N_
init-nls
init-locale))
(define %version "0.1")
(define %author "Vivanne Langdon")
(define %license 'gpl3+)
(define %copyright '(2024))
(define %gettext-domain "guile-runewidth")
(define G_ identity)
(define N_ identity)
(define (init-nls) "Dummy as no NLS is used" #t)
(define (init-locale)
"Dummy as no NLS is used"
#t)

68
runewidth/posix.scm Normal file
View file

@ -0,0 +1,68 @@
(define-module (runewidth posix)
#:use-module (ice-9 regex)
#:export (east-asian?))
(define wide-locales
(list
"utf-8"
"utf8"
"jis"
"eucjp"
"euckr"
"euccn"
"sjis"
"cp932"
"cp51932"
"cp936"
"cp949"
"cp950"
"big5"
"gbk"
"gb2312"))
;; algorithm from:
;; https://github.com/mattn/go-runewidth/blob/master/runewidth_posix.go
;; For extracting the charset part of the locale string (some locales require this)
;; Note regex and capture group different as guile does not support 'non-capturing group' syntax
(define charset-regexp (make-regexp "^[a-z][a-z][a-z]?(_[A-Z][A-Z])?\\.(.+)"))
;; POSIX, C, C-asdfs or C.asdfdsf
(define c-or-posix-regexp (make-regexp "^(POSIX$|C($|[\\.-]))"))
(define (get-env-locale)
(or (getenv "LC_ALL")
(getenv "LC_CTYPE")
(getenv "LANG")
""))
(define* (east-asian? #:optional (locale (get-env-locale)))
"Check if a given locale (or the currently installed locale from the environment) is considered an east asian locale"
(unless (string? locale)
(error "`locale' must be a string"))
(define charset
(string-downcase
(or
;; Separating out locale from charset with our regex
(let ((locale-match (regexp-exec charset-regexp locale)))
(and
(regexp-match? locale-match)
(= 3 (match:count locale-match))
(match:substring locale-match 2)))
locale)))
(and
(not (regexp-exec c-or-posix-regexp locale))
(not (string-suffix? "@cjk_narrow" charset))
(let ((index-@ (string-index charset #\@)))
;; strip @foo from the end of the charset
(when index-@
(set! charset (substring/shared charset 0 index-@)))
(and (member charset wide-locales)
(or
(not (eq? #\u (string-ref charset 0)))
(string-prefix? "ja" locale)
(string-prefix? "ko" locale)
(string-prefix? "zh" locale))))))

23
tests/test-posix.scm Normal file
View file

@ -0,0 +1,23 @@
(define-module (tests test-posix)
#:use-module (runewidth posix)
#:use-module (srfi srfi-64))
(test-begin "test-posix")
(define (test-east expected charset)
(test-equal expected (east-asian? charset)))
(define data
'((#f "foo@cjk_narrow")
(#f "foo@cjk")
(#f "utf-8@cjk")
(#f "C")
(#f "POSIX")
(#f "C.UTF-8")
(#t "ja_JP.UTF-8")
(#t "ja_JP.CP932")
(#f "en_US.UTF-8")))
(for-each (λ (p) (apply test-east p)) data)
(test-end "test-posix")