2024-03-03 18:13:22 +00:00
|
|
|
(define-module (uniseg)
|
|
|
|
#:use-module (srfi srfi-1)
|
|
|
|
#:use-module (ice-9 match)
|
2024-03-04 01:25:47 +00:00
|
|
|
#:use-module (srfi srfi-41)
|
2024-03-03 18:13:22 +00:00
|
|
|
#:use-module (uniseg emoji)
|
|
|
|
#:use-module (uniseg graphemes)
|
2024-03-04 01:25:47 +00:00
|
|
|
#:use-module (uniseg graphemes stream)
|
2024-03-03 18:13:22 +00:00
|
|
|
#:use-module (uniseg eastasian)
|
|
|
|
#:export (emoji?
|
2024-03-04 01:25:47 +00:00
|
|
|
char->grapheme-property
|
|
|
|
char->eastasian-property
|
|
|
|
char-width
|
|
|
|
string-width))
|
2024-03-03 18:13:22 +00:00
|
|
|
|
|
|
|
(define (emoji? char)
|
2024-03-04 01:25:47 +00:00
|
|
|
(char-set-contains? char-set:emoji-extended-pictographic char))
|
2024-03-03 18:13:22 +00:00
|
|
|
|
2024-03-04 01:25:47 +00:00
|
|
|
(define (char->prop sets char)
|
2024-03-03 18:13:22 +00:00
|
|
|
(let ((pair
|
|
|
|
(find
|
|
|
|
(λ (p) (char-set-contains? (second p) char))
|
|
|
|
sets)))
|
|
|
|
(and pair
|
|
|
|
(first pair))))
|
|
|
|
|
2024-03-04 01:25:47 +00:00
|
|
|
(define (char->grapheme-property char)
|
2024-03-03 18:27:11 +00:00
|
|
|
"Find the Unicode grapheme cluster property, as defined by https://www.unicode.org/reports/tr44/"
|
2024-03-04 01:25:47 +00:00
|
|
|
(or (char->prop grapheme-charsets char)
|
2024-03-03 18:13:22 +00:00
|
|
|
'other))
|
|
|
|
|
2024-03-04 01:25:47 +00:00
|
|
|
(define (char->eastasian-property char)
|
2024-03-03 18:27:11 +00:00
|
|
|
"Find the Unicode East Asian width property, as defined by https://www.unicode.org/reports/tr11/"
|
2024-03-03 18:13:22 +00:00
|
|
|
(define (<==> lo hi)
|
|
|
|
"Create a range predicate, inclusive between two numbers"
|
|
|
|
(λ (num)
|
|
|
|
(and (>= num lo)
|
|
|
|
(<= num hi))))
|
|
|
|
|
2024-03-03 18:27:11 +00:00
|
|
|
;; The spec requires that any values not in the table be double-width, if
|
|
|
|
;; they are in certain ranges. So let's take care of that here.
|
|
|
|
;; These characters are not in the char-sets as they are undesignated.
|
|
|
|
;; See comments in unicode's EastAsianWidth.txt
|
2024-03-03 18:13:22 +00:00
|
|
|
(define range-unified-ideographs-ext-a?
|
|
|
|
(<==> #x3400 #x4DBF))
|
|
|
|
(define range-unified-ideographs?
|
|
|
|
(<==> #x4E00 #x9FFF))
|
|
|
|
(define range-compatibility-ideographs?
|
|
|
|
(<==> #xF900 #xFAFF))
|
|
|
|
(define range-plane-2?
|
|
|
|
(<==> #x20000 #x2FFFD))
|
|
|
|
(define range-plane-3?
|
|
|
|
(<==> #x30000 #x3FFFD))
|
|
|
|
|
2024-03-04 01:25:47 +00:00
|
|
|
(or (char->prop eastasian-charsets char)
|
2024-03-03 18:13:22 +00:00
|
|
|
(let ((int (char->integer char)))
|
|
|
|
(match int
|
|
|
|
((or (? range-unified-ideographs-ext-a?)
|
|
|
|
(? range-unified-ideographs?)
|
|
|
|
(? range-compatibility-ideographs?)
|
|
|
|
(? range-plane-2?)
|
|
|
|
(? range-plane-3?))
|
|
|
|
'doublewidth)
|
|
|
|
(else
|
|
|
|
'neutral)))))
|
2024-03-04 01:25:47 +00:00
|
|
|
|
|
|
|
(define (char-width rune)
|
|
|
|
"Convert from a character to its visible width. Returns the width as well as the grapheme property to reduce calculations"
|
|
|
|
(define grapheme-prop (char->grapheme-property rune))
|
|
|
|
(values
|
|
|
|
(or
|
|
|
|
(match grapheme-prop
|
|
|
|
((or 'control
|
|
|
|
'carriage-return
|
|
|
|
'line-feed
|
|
|
|
'extend
|
|
|
|
'zero-width-joiner)
|
|
|
|
0)
|
|
|
|
('regional-indicator
|
|
|
|
2)
|
|
|
|
('extended-pictographic
|
|
|
|
(if
|
|
|
|
(char-set-contains? char-set:emoji-presentation rune)
|
|
|
|
2
|
|
|
|
1))
|
|
|
|
(else #f)
|
|
|
|
)
|
|
|
|
(match (char->integer rune)
|
|
|
|
(#x2E3A 3)
|
|
|
|
(#x2E3B 4)
|
|
|
|
(else #f))
|
|
|
|
(match (char->eastasian-property rune)
|
|
|
|
('doublewidth 2)
|
|
|
|
;; Default for monospace is to render ambiguous as 1
|
|
|
|
('ambiguous 1)
|
|
|
|
(else 1)))
|
|
|
|
grapheme-prop))
|
|
|
|
|
|
|
|
(define (string-width str)
|
|
|
|
"Get the width of a string by adding up the widths of each grapheme"
|
|
|
|
(stream-fold
|
|
|
|
(λ (val grapheme)
|
|
|
|
(+ val (grapheme-width grapheme)))
|
|
|
|
0
|
|
|
|
(string->grapheme-stream str)))
|