Adding check for east asian charset with tests

2024-02-29 15:17:43 -05:00 · 2024-02-29 15:17:43 -05:00 · eaaceb3c56
commit eaaceb3c56
parent a5bdb2e688
5 changed files with 128 additions and 2 deletions
--- a/hall.scm
+++ b/hall.scm
@ -21,7 +21,7 @@
  (files (libraries
           ((directory "runewidth" ())
            (scheme-file "runewidth")))
-         (tests ((directory "tests" ())))
+         (tests ((directory "tests" ((scheme-file "test-posix")))))
         (programs ())
         (documentation
           ((org-file "README")
--- a/runewidth/hconfig.scm
+++ b/runewidth/hconfig.scm
@ -0,0 +1,35 @@
+(define-module
+  (runewidth hconfig)
+  #:use-module
+  (srfi srfi-26)
+  #:export
+  (%version
+    %author
+    %license
+    %copyright
+    %gettext-domain
+    G_
+    N_
+    init-nls
+    init-locale))
+
+(define %version "0.1")
+
+(define %author "Vivanne Langdon")
+
+(define %license 'gpl3+)
+
+(define %copyright '(2024))
+
+(define %gettext-domain "guile-runewidth")
+
+(define G_ identity)
+
+(define N_ identity)
+
+(define (init-nls) "Dummy as no NLS is used" #t)
+
+(define (init-locale)
+  "Dummy as no NLS is used"
+  #t)
+
--- a/runewidth/posix.scm
+++ b/runewidth/posix.scm
@ -0,0 +1,68 @@
+(define-module (runewidth posix)
+  #:use-module (ice-9 regex)
+  #:export (east-asian?))
+
+(define wide-locales
+  (list
+	 "utf-8"
+	 "utf8"
+	 "jis"
+	 "eucjp"
+	 "euckr"
+	 "euccn"
+	 "sjis"
+	 "cp932"
+	 "cp51932"
+	 "cp936"
+	 "cp949"
+	 "cp950"
+	 "big5"
+	 "gbk"
+	 "gb2312"))
+
+;; algorithm from:
+;; https://github.com/mattn/go-runewidth/blob/master/runewidth_posix.go
+
+;; For extracting the charset part of the locale string (some locales require this)
+;; Note regex and capture group different as guile does not support 'non-capturing group' syntax
+(define charset-regexp (make-regexp "^[a-z][a-z][a-z]?(_[A-Z][A-Z])?\\.(.+)"))
+
+;; POSIX, C, C-asdfs or C.asdfdsf
+(define c-or-posix-regexp (make-regexp "^(POSIX$|C($|[\\.-]))"))
+
+(define (get-env-locale)
+  (or (getenv "LC_ALL")
+      (getenv "LC_CTYPE")
+      (getenv "LANG")
+      ""))
+
+(define* (east-asian? #:optional (locale (get-env-locale)))
+  "Check if a given locale (or the currently installed locale from the environment) is considered an east asian locale"
+  (unless (string? locale)
+    (error "`locale' must be a string"))
+
+  (define charset
+    (string-downcase
+     (or
+      ;; Separating out locale from charset with our regex
+      (let ((locale-match (regexp-exec charset-regexp locale)))
+        (and
+         (regexp-match? locale-match)
+         (= 3 (match:count locale-match))
+         (match:substring locale-match 2)))
+      locale)))
+
+  (and
+   (not (regexp-exec c-or-posix-regexp locale))
+   (not (string-suffix? "@cjk_narrow" charset))
+   (let ((index-@ (string-index charset #\@)))
+     ;; strip @foo from the end of the charset
+     (when index-@
+       (set! charset (substring/shared charset 0 index-@)))
+
+     (and (member charset wide-locales)
+          (or
+           (not (eq? #\u (string-ref charset 0)))
+           (string-prefix? "ja" locale)
+           (string-prefix? "ko" locale)
+           (string-prefix? "zh" locale))))))
--- a/tests/test-posix.scm
+++ b/tests/test-posix.scm
@ -0,0 +1,23 @@
+(define-module (tests test-posix)
+  #:use-module (runewidth posix)
+  #:use-module (srfi srfi-64))
+
+(test-begin "test-posix")
+
+(define (test-east expected charset)
+  (test-equal expected (east-asian? charset)))
+
+(define data
+  '((#f "foo@cjk_narrow")
+    (#f "foo@cjk")
+    (#f "utf-8@cjk")
+    (#f "C")
+    (#f "POSIX")
+    (#f "C.UTF-8")
+    (#t "ja_JP.UTF-8")
+    (#t "ja_JP.CP932")
+    (#f "en_US.UTF-8")))
+
+(for-each (λ (p) (apply test-east p)) data)
+
+(test-end "test-posix")