surrogate pair code

This commit is contained in:
XeroOl 2023-07-03 22:44:21 -05:00
parent 21bd635123
commit 47f1e819ce
2 changed files with 50 additions and 0 deletions

11
TODO.md
View File

@ -1,7 +1,18 @@
# Wishlist of features
([X] = complete, [ ] = planned)
code unit count per code point (code units are not to be confused with code points)
code point range |utf8 |utf16| offset (utf16 - utf8)
000000 - 00007F | 1 | 1 | 0
000080 - 0007FF | 2 | 1 | -1
000800 - 00FFFF | 3 | 1 | -2
010000 - 10FFFF | 4 | 2 | -2
utf8 chart
000000 - 00007F 0xxxxxxx
000080 - 0007FF 110xxxxx 10xxxxxx
000800 - 00FFFF 1110xxxx 10xxxxxx 10xxxxxx
010000 - 10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
My current goal is to work on completions a little bit more.
- [ ] Fix crash-files.test2

View File

@ -0,0 +1,39 @@
(fn byte->unit16 [str ?byte]
  "Translate a byte offset into `str` to a UTF-16 code unit offset.
`?byte` defaults to the byte length of `str`. Only the first `?byte` bytes are
inspected: each lead byte of a 2-byte sequence takes 1 off the byte count, and
each lead byte of a 3- or 4-byte sequence takes 2 off.
Author's caveat: does not work if the string contains a new line."
  (let [end-byte (or ?byte (length str))
        prefix (str:sub 1 end-byte)
        ;; number of bytes in the prefix that match a single-byte class
        count-matches (fn [class]
                        (accumulate [n 0
                                     _ (prefix:gmatch class)]
                          (+ n 1)))
        ;; lead bytes of 2-byte sequences: 2 bytes -> 1 unit
        two-byte-leads (count-matches "[\192-\223]")
        ;; lead bytes of 3-byte (3 -> 1 unit) and 4-byte (4 -> 2 units) sequences
        wide-leads (count-matches "[\224-\247]")]
    (- end-byte two-byte-leads (* 2 wide-leads))))
(fn unit16->byte [str unit16]
  "Convert a UTF-16 code unit offset into `str` to the equivalent byte offset.
Walks `str` one UTF-8 character at a time, summing byte lengths until `unit16`
code units have been consumed or the string ends. An offset that lands between
the two halves of a surrogate pair resolves to the end of that character.
Always returns exactly one value, the byte offset. Raises an error if a
character longer than 4 bytes is met while walking.
Author's caveat: does not work if the string contains a new line."
  ;; TODO replace with faccumulate and :sub, because it is 70 times faster than gmatch
  (let [(byte-offset _units-left)
        (accumulate
          [(total ul) (values 0 unit16)
           ;; One character = a non-continuation byte plus the continuation
           ;; bytes that follow it. The lead class is spelled as a negated set
           ;; because Lua 5.1 / LuaJIT patterns cannot contain a literal \000.
           utf8-character (str:gmatch "[^\128-\191][\128-\191]*")
           &until (<= ul 0)]
          (let [len (length utf8-character)]
            (values
              (+ total len)
              ;; UTF-16 units used by a character of `len` UTF-8 bytes
              (- ul (case len
                      1 1
                      2 1
                      3 1
                      4 2
                      _ (error "invalid utf8"))))))]
    ;; Return the byte total only: the leftover unit counter is bookkeeping and
    ;; would otherwise leak to callers as a second return value.
    byte-offset))
;; Ad-hoc smoke checks: every printed line shows the computed offset followed
;; by the expected one, so the two columns should be equal.
(let [sample "aλb𐐀"]
  ;; byte offset -> UTF-16 code unit offset
  (each [_ [byte expected] (ipairs [[1 1] [3 2] [4 3]])]
    (print (byte->unit16 sample byte) expected))
  ;; byte offset omitted: measures the whole string
  (print (byte->unit16 sample) 5)
  ;; UTF-16 code unit offset -> byte offset
  (each [_ [unit16 expected] (ipairs [[1 1] [2 3] [3 4] [5 8]])]
    (print (unit16->byte sample unit16) expected)))