guile-uniseg/uniseg.scm
Vivianne Langdon 31012d5b8f Yet another reorganization, and solve Christine's 'rude problem'
- We change the stream iterator to *always* return a grapheme
  (except for EOF). The grapheme then gets built up over time.
  - This way, trans flag for example is first white flag, then
  white flag + zwj, etc until it finally transforms into the
  trans flag.
  - Users of the stream library can then use the `modification?'
  flag to determine if the stream value is a modification of the
  prior grapheme instead of a new grapheme.
  - Abstracted iteration to an iterator object to support use cases
  where we don't have an input stream (reflow needs this!)
2024-03-05 11:46:32 -05:00

105 lines
3.1 KiB
Scheme

(define-module (uniseg)
#:use-module (srfi srfi-1)
#:use-module (ice-9 match)
#:use-module (srfi srfi-41)
#:use-module (uniseg graphemes)
#:use-module (uniseg graphemes stream)
#:use-module (uniseg charsets emoji)
#:use-module (uniseg charsets eastasian)
#:use-module (uniseg charsets graphemes)
#:export (emoji?
char->grapheme-property
char->eastasian-property
char-width
string-width))
(define (emoji? char)
(char-set-contains? char-set:emoji-extended-pictographic char))
(define (char->prop sets char)
(let ((pair
(find
(λ (p) (char-set-contains? (second p) char))
sets)))
(and pair
(first pair))))
(define (char->grapheme-property char)
"Find the Unicode grapheme cluster property, as defined by https://www.unicode.org/reports/tr44/"
(or (char->prop grapheme-charsets char)
'other))
(define (char->eastasian-property char)
"Find the Unicode East Asian width property, as defined by https://www.unicode.org/reports/tr11/"
(define (<==> lo hi)
"Create a range predicate, inclusive between two numbers"
(λ (num)
(and (>= num lo)
(<= num hi))))
;; The spec requires that any values not in the table be double-width, if
;; they are in certain ranges. So let's take care of that here.
;; These characters are not in the char-sets as they are undesignated.
;; See comments in unicode's EastAsianWidth.txt
(define range-unified-ideographs-ext-a?
(<==> #x3400 #x4DBF))
(define range-unified-ideographs?
(<==> #x4E00 #x9FFF))
(define range-compatibility-ideographs?
(<==> #xF900 #xFAFF))
(define range-plane-2?
(<==> #x20000 #x2FFFD))
(define range-plane-3?
(<==> #x30000 #x3FFFD))
(or (char->prop eastasian-charsets char)
(let ((int (char->integer char)))
(match int
((or (? range-unified-ideographs-ext-a?)
(? range-unified-ideographs?)
(? range-compatibility-ideographs?)
(? range-plane-2?)
(? range-plane-3?))
'doublewidth)
(else
'neutral)))))
(define (char-width rune)
"Convert from a character to its visible width. Returns the width as well as the grapheme property to reduce calculations"
(define grapheme-prop (char->grapheme-property rune))
(values
(or
(match grapheme-prop
((or 'control
'carriage-return
'line-feed
'extend
'zero-width-joiner)
0)
('regional-indicator
2)
('extended-pictographic
(if
(char-set-contains? char-set:emoji-presentation rune)
2
1))
(else #f)
)
(match (char->integer rune)
(#x2E3A 3)
(#x2E3B 4)
(else #f))
(match (char->eastasian-property rune)
('doublewidth 2)
;; Default for monospace is to render ambiguous as 1
('ambiguous 1)
(else 1)))
grapheme-prop))
(define (string-width str)
"Get the width of a string by adding up the widths of each grapheme"
(stream-fold
(λ (val grapheme)
(+ val (grapheme-delta-width grapheme)))
0
(string->grapheme-stream str)))