mirror of
https://github.com/italicsjenga/valence.git
synced 2024-12-23 22:41:30 +11:00
d22f1edae1
Improves write performance and adds `binary_encoded_len` to compounds.
129 lines
3.9 KiB
Rust
129 lines
3.9 KiB
Rust
//! Utilities for working with Java's "Modified UTF-8" character encoding.
|
|
//!
|
|
//! For more information, refer to [Wikipedia].
|
|
//!
|
|
//! [Wikipedia]: https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8
|
|
|
|
use std::io;
|
|
use std::io::Write;
|
|
use std::str::from_utf8_unchecked;
|
|
|
|
use byteorder::{BigEndian, WriteBytesExt};
|
|
|
|
pub fn write_modified_utf8(mut writer: impl Write, text: &str) -> io::Result<()> {
|
|
let bytes = text.as_bytes();
|
|
let mut i = 0;
|
|
|
|
while i < bytes.len() {
|
|
match bytes[i] {
|
|
0 => {
|
|
writer.write_u16::<BigEndian>(0xc080)?;
|
|
i += 1;
|
|
}
|
|
b @ 1..=127 => {
|
|
writer.write_u8(b)?;
|
|
i += 1;
|
|
}
|
|
b => {
|
|
let w = utf8_char_width(b);
|
|
debug_assert!(w <= 4);
|
|
debug_assert!(i + w <= bytes.len());
|
|
|
|
if w != 4 {
|
|
writer.write_all(&bytes[i..i + w])?;
|
|
} else {
|
|
let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) };
|
|
let c = s.chars().next().unwrap() as u32 - 0x10000;
|
|
|
|
let s0 = ((c >> 10) as u16) | 0xd800;
|
|
let s1 = ((c & 0x3ff) as u16) | 0xdc00;
|
|
|
|
writer.write_all(encode_surrogate(s0).as_slice())?;
|
|
writer.write_all(encode_surrogate(s1).as_slice())?;
|
|
}
|
|
i += w;
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Returns the length in bytes of the UTF-8 sequence that starts with `first_byte`.
///
/// Returns `0` for bytes that cannot start a sequence: continuation bytes
/// (`0x80..=0xbf`), `0xc0`, `0xc1`, and `0xf5..=0xff`.
const fn utf8_char_width(first_byte: u8) -> usize {
    match first_byte {
        0x00..=0x7f => 1,
        0xc2..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xf4 => 4,
        _ => 0,
    }
}
|
|
|
|
/// Encodes a single UTF-16 surrogate code unit as a three-byte UTF-8-style sequence.
///
/// The 16 bits are split 4/6/6 across a lead byte tagged `1110xxxx` and two
/// continuation bytes tagged `10xxxxxx`. In debug builds, `surrogate` is
/// asserted to lie in `0xd800..=0xdfff`.
fn encode_surrogate(surrogate: u16) -> [u8; 3] {
    debug_assert!((0xd800..=0xdfff).contains(&surrogate));

    const LEAD_TAG: u8 = 0b1110_0000;
    const CONT_TAG: u8 = 0b1000_0000;
    const SIX_BITS: u16 = 0b0011_1111;

    let top = (surrogate >> 12) as u8;
    let middle = ((surrogate >> 6) & SIX_BITS) as u8;
    let bottom = (surrogate & SIX_BITS) as u8;

    [LEAD_TAG | top, CONT_TAG | middle, CONT_TAG | bottom]
}
|
|
|
|
/// Returns the number of bytes `text` occupies when encoded as Java's
/// "Modified UTF-8", i.e. the number of bytes [`write_modified_utf8`] writes.
///
/// The encoded form differs in length from plain UTF-8 in exactly two cases:
/// - U+0000 takes two bytes instead of one.
/// - A code point that needs four UTF-8 bytes takes six (a surrogate pair
///   with three bytes per surrogate).
///
/// So every input byte counts once, each NUL byte adds one, and each
/// four-byte lead byte adds two.
pub fn encoded_len(text: &str) -> usize {
    // A `&str` is valid UTF-8, so a byte `>= 0xf0` can only be the lead byte of a
    // four-byte sequence. Summing per byte needs no index bookkeeping and cannot
    // stall the way a width-driven cursor would on a zero width.
    text.bytes()
        .map(|b| 1 + usize::from(b == 0) + 2 * usize::from(b >= 0xf0))
        .sum()
}
|
|
|
|
/// Checks that our encoder and length calculation agree with the `cesu8`
/// crate's Java CESU-8 output on ASCII (with a NUL), emoji, and Latin-1 text.
#[cfg(test)]
#[test]
fn equivalence() {
    let samples = [
        "Mary had a little lamb\0",
        "🤡💩👻💀☠👽👾🤖🎃😺😸😹😻😼😽🙀😿😾",
        "ÅÆÇÈØõ÷£¥ý",
    ];

    for &sample in &samples {
        let mut actual = Vec::new();

        let expected = cesu8::to_java_cesu8(sample);
        write_modified_utf8(&mut actual, sample).unwrap();

        // The bytes must match, and the predicted length must match the bytes.
        assert_eq!(expected, actual);
        assert_eq!(expected.len(), encoded_len(sample));
    }
}
|