From eaaceb3c56cec0aa0760cca17fb2d751f989b822 Mon Sep 17 00:00:00 2001 From: Vivianne Langdon Date: Thu, 29 Feb 2024 15:17:43 -0500 Subject: [PATCH] Adding check for east asian charset with tests --- guix.scm | 2 +- hall.scm | 2 +- runewidth/hconfig.scm | 35 ++++++++++++++++++++++ runewidth/posix.scm | 68 +++++++++++++++++++++++++++++++++++++++++++ tests/test-posix.scm | 23 +++++++++++++++ 5 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 runewidth/hconfig.scm create mode 100644 runewidth/posix.scm create mode 100644 tests/test-posix.scm diff --git a/guix.scm b/guix.scm index 34bd86d..d263790 100644 --- a/guix.scm +++ b/guix.scm @@ -33,7 +33,7 @@ (propagated-inputs (list)) (synopsis "Library for utf-8 rune widths") (description - "A library that provides guile character sets and operations to work with runes that are more than a single character width.") + "A library that provides guile character sets and operations to work with runes that are more than a single character width.") (home-page "https://git.solarpunk.moe/vv/guile-runewidth") (license license:gpl3+)) diff --git a/hall.scm b/hall.scm index e6865f4..2b70a9a 100644 --- a/hall.scm +++ b/hall.scm @@ -21,7 +21,7 @@ (files (libraries ((directory "runewidth" ()) (scheme-file "runewidth"))) - (tests ((directory "tests" ()))) + (tests ((directory "tests" ((scheme-file "test-posix"))))) (programs ()) (documentation ((org-file "README") diff --git a/runewidth/hconfig.scm b/runewidth/hconfig.scm new file mode 100644 index 0000000..da1f4b9 --- /dev/null +++ b/runewidth/hconfig.scm @@ -0,0 +1,35 @@ +(define-module + (runewidth hconfig) + #:use-module + (srfi srfi-26) + #:export + (%version + %author + %license + %copyright + %gettext-domain + G_ + N_ + init-nls + init-locale)) + +(define %version "0.1") + +(define %author "Vivanne Langdon") + +(define %license 'gpl3+) + +(define %copyright '(2024)) + +(define %gettext-domain "guile-runewidth") + +(define G_ identity) + +(define N_ identity) + +(define (init-nls) "Dummy as no NLS is used" #t) + +(define (init-locale) + "Dummy as no NLS is used" + #t) + diff --git a/runewidth/posix.scm b/runewidth/posix.scm new file mode 100644 index 0000000..89f4fc2 --- /dev/null +++ b/runewidth/posix.scm @@ -0,0 +1,68 @@ +(define-module (runewidth posix) + #:use-module (ice-9 regex) + #:export (east-asian?)) + +(define wide-locales + (list + "utf-8" + "utf8" + "jis" + "eucjp" + "euckr" + "euccn" + "sjis" + "cp932" + "cp51932" + "cp936" + "cp949" + "cp950" + "big5" + "gbk" + "gb2312")) + +;; algorithm from: +;; https://github.com/mattn/go-runewidth/blob/master/runewidth_posix.go + +;; For extracting the charset part of the locale string (some locales require this) +;; Note regex and capture group different as guile does not support 'non-capturing group' syntax +(define charset-regexp (make-regexp "^[a-z][a-z][a-z]?(_[A-Z][A-Z])?\\.(.+)")) + +;; POSIX, C, C-asdfs or C.asdfdsf +(define c-or-posix-regexp (make-regexp "^(POSIX$|C($|[\\.-]))")) + +(define (get-env-locale) + (or (getenv "LC_ALL") + (getenv "LC_CTYPE") + (getenv "LANG") + "")) + +(define* (east-asian? #:optional (locale (get-env-locale))) + "Check if a given locale (or the currently installed locale from the environment) is considered an east asian locale" + (unless (string? locale) + (error "`locale' must be a string")) + + (define charset + (string-downcase + (or + ;; Separating out locale from charset with our regex + (let ((locale-match (regexp-exec charset-regexp locale))) + (and + (regexp-match? locale-match) + (= 3 (match:count locale-match)) + (match:substring locale-match 2))) + locale))) + + (and + (not (regexp-exec c-or-posix-regexp locale)) + (not (string-suffix? "@cjk_narrow" charset)) + (let ((index-@ (string-index charset #\@))) + ;; strip @foo from the end of the charset + (when index-@ + (set! charset (substring/shared charset 0 index-@))) + + (and (member charset wide-locales) + (or + (not (eq? #\u (string-ref charset 0))) + (string-prefix? "ja" locale) + (string-prefix? "ko" locale) + (string-prefix? "zh" locale)))))) diff --git a/tests/test-posix.scm b/tests/test-posix.scm new file mode 100644 index 0000000..efc1d68 --- /dev/null +++ b/tests/test-posix.scm @@ -0,0 +1,23 @@ +(define-module (tests test-posix) + #:use-module (runewidth posix) + #:use-module (srfi srfi-64)) + +(test-begin "test-posix") + +(define (test-east expected charset) + (test-equal expected (east-asian? charset))) + +(define data + '((#f "foo@cjk_narrow") + (#f "foo@cjk") + (#f "utf-8@cjk") + (#f "C") + (#f "POSIX") + (#f "C.UTF-8") + (#t "ja_JP.UTF-8") + (#t "ja_JP.CP932") + (#f "en_US.UTF-8"))) + +(for-each (λ (p) (apply test-east p)) data) + +(test-end "test-posix")