guile-uniseg/uniseg.scm

106 lines
3.1 KiB
Scheme
Raw Normal View History

2024-03-03 18:13:22 +00:00
(define-module (uniseg)
#:use-module (srfi srfi-1)
#:use-module (ice-9 match)
2024-03-04 01:25:47 +00:00
#:use-module (srfi srfi-41)
2024-03-03 18:13:22 +00:00
#:use-module (uniseg graphemes)
2024-03-04 01:25:47 +00:00
#:use-module (uniseg graphemes stream)
#:use-module (uniseg charsets emoji)
#:use-module (uniseg charsets eastasian)
#:use-module (uniseg charsets graphemes)
2024-03-03 18:13:22 +00:00
#:export (emoji?
2024-03-04 01:25:47 +00:00
char->grapheme-property
char->eastasian-property
char-width
string-width))
2024-03-03 18:13:22 +00:00
(define (emoji? char)
2024-03-04 01:25:47 +00:00
(char-set-contains? char-set:emoji-extended-pictographic char))
2024-03-03 18:13:22 +00:00
2024-03-04 01:25:47 +00:00
(define (char->prop sets char)
2024-03-03 18:13:22 +00:00
(let ((pair
(find
(λ (p) (char-set-contains? (second p) char))
sets)))
(and pair
(first pair))))
2024-03-04 01:25:47 +00:00
(define (char->grapheme-property char)
2024-03-03 18:27:11 +00:00
"Find the Unicode grapheme cluster property, as defined by https://www.unicode.org/reports/tr44/"
2024-03-04 01:25:47 +00:00
(or (char->prop grapheme-charsets char)
2024-03-03 18:13:22 +00:00
'other))
2024-03-04 01:25:47 +00:00
(define (char->eastasian-property char)
2024-03-03 18:27:11 +00:00
"Find the Unicode East Asian width property, as defined by https://www.unicode.org/reports/tr11/"
2024-03-03 18:13:22 +00:00
(define (<==> lo hi)
"Create a range predicate, inclusive between two numbers"
(λ (num)
(and (>= num lo)
(<= num hi))))
2024-03-03 18:27:11 +00:00
;; The spec requires that any values not in the table be double-width, if
;; they are in certain ranges. So let's take care of that here.
;; These characters are not in the char-sets as they are undesignated.
;; See comments in unicode's EastAsianWidth.txt
2024-03-03 18:13:22 +00:00
(define range-unified-ideographs-ext-a?
(<==> #x3400 #x4DBF))
(define range-unified-ideographs?
(<==> #x4E00 #x9FFF))
(define range-compatibility-ideographs?
(<==> #xF900 #xFAFF))
(define range-plane-2?
(<==> #x20000 #x2FFFD))
(define range-plane-3?
(<==> #x30000 #x3FFFD))
2024-03-04 01:25:47 +00:00
(or (char->prop eastasian-charsets char)
2024-03-03 18:13:22 +00:00
(let ((int (char->integer char)))
(match int
((or (? range-unified-ideographs-ext-a?)
(? range-unified-ideographs?)
(? range-compatibility-ideographs?)
(? range-plane-2?)
(? range-plane-3?))
'doublewidth)
(else
'neutral)))))
2024-03-04 01:25:47 +00:00
(define (char-width rune)
"Convert from a character to its visible width. Returns the width as well as the grapheme property to reduce calculations"
(define grapheme-prop (char->grapheme-property rune))
(values
(or
(match grapheme-prop
((or 'control
'carriage-return
'line-feed
'extend
'zero-width-joiner)
0)
('regional-indicator
2)
('extended-pictographic
(if
(char-set-contains? char-set:emoji-presentation rune)
2
1))
(else #f)
)
(match (char->integer rune)
(#x2E3A 3)
(#x2E3B 4)
(else #f))
(match (char->eastasian-property rune)
('doublewidth 2)
;; Default for monospace is to render ambiguous as 1
('ambiguous 1)
(else 1)))
grapheme-prop))
(define (string-width str)
"Get the width of a string by adding up the widths of each grapheme"
(stream-fold
(λ (val grapheme)
(+ val (grapheme-delta-width grapheme)))
2024-03-04 01:25:47 +00:00
0
(string->grapheme-stream str)))