valence/valence_nbt/src/modified_utf8.rs

//! Utilities for working with Java's "Modified UTF-8" character encoding.
//!
//! For more information, refer to [Wikipedia].
//!
//! [Wikipedia]: https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8

use std::io;
use std::io::Write;
use std::str::from_utf8_unchecked;

use byteorder::{BigEndian, WriteBytesExt};

/// Writes `text` to `writer` in Java's "Modified UTF-8" encoding.
///
/// The output differs from the UTF-8 bytes of `text` in two ways:
///
/// - each NUL byte is written as the two-byte sequence `0xC0 0x80`;
/// - each four-byte UTF-8 sequence is written as a UTF-16 surrogate pair,
///   with each surrogate encoded as its own three-byte sequence (six bytes
///   in total).
///
/// Every other byte is copied through unchanged. Only the encoded bytes are
/// written; no length prefix is emitted.
///
/// # Errors
///
/// Propagates the first I/O error returned by `writer`.
pub fn write_modified_utf8(mut writer: impl Write, text: &str) -> io::Result<()> {
    let raw = text.as_bytes();
    let mut pos = 0;

    while let Some(&lead) = raw.get(pos) {
        // NUL gets the overlong two-byte form.
        if lead == 0 {
            writer.write_u16::<BigEndian>(0xc080)?;
            pos += 1;
            continue;
        }

        // Remaining ASCII is passed through as-is.
        if lead < 128 {
            writer.write_u8(lead)?;
            pos += 1;
            continue;
        }

        // `lead` starts a multi-byte sequence of `width` bytes.
        let width = utf8_char_width(lead);
        debug_assert!(width <= 4);
        debug_assert!(pos + width <= raw.len());

        let end = pos + width;
        let sequence = &raw[pos..end];

        if width == 4 {
            // SAFETY: `raw` is the byte view of a `&str`, so it is valid
            // UTF-8, and `pos` only ever advances by whole sequence widths,
            // so `sequence` is exactly one complete four-byte character.
            let decoded = unsafe { from_utf8_unchecked(sequence) };
            // Four-byte sequences encode code points >= 0x10000; subtract
            // the base to get the 20-bit value split across the surrogates.
            let offset = decoded.chars().next().unwrap() as u32 - 0x10000;

            let high = 0xd800 | (offset >> 10) as u16;
            let low = 0xdc00 | (offset & 0x3ff) as u16;

            writer.write_all(encode_surrogate(high).as_slice())?;
            writer.write_all(encode_surrogate(low).as_slice())?;
        } else {
            // Two- and three-byte sequences are identical in both encodings.
            writer.write_all(sequence)?;
        }

        pos = end;
    }

    Ok(())
}

/// Returns the length in bytes of the UTF-8 sequence introduced by
/// `first_byte`, or `0` if the byte cannot start a sequence (continuation
/// bytes `0x80..=0xBF` and the never-valid lead bytes `0xC0`, `0xC1` and
/// `0xF5..=0xFF`).
const fn utf8_char_width(first_byte: u8) -> usize {
    // The lookup table is built once at compile time from the lead-byte
    // ranges, so the runtime cost is still a single indexed load.
    const WIDTH_BY_LEAD_BYTE: [u8; 256] = {
        let mut table = [0u8; 256];
        let mut byte = 0usize;

        while byte < 256 {
            table[byte] = match byte {
                0x00..=0x7f => 1,
                0xc2..=0xdf => 2,
                0xe0..=0xef => 3,
                0xf0..=0xf4 => 4,
                _ => 0,
            };
            byte += 1;
        }

        table
    };

    WIDTH_BY_LEAD_BYTE[first_byte as usize] as usize
}

/// Encodes a single UTF-16 surrogate code unit as a three-byte sequence
/// with the layout `1110xxxx 10xxxxxx 10xxxxxx`.
///
/// In debug builds this asserts that `surrogate` lies in `0xD800..=0xDFFF`.
fn encode_surrogate(surrogate: u16) -> [u8; 3] {
    debug_assert!((0xd800..=0xdfff).contains(&surrogate));

    const LEAD_TAG: u8 = 0xe0;
    const CONTINUATION_TAG: u8 = 0x80;
    const SIX_BITS: u16 = 0x3f;

    // Split the 16-bit value into its top 4, middle 6 and low 6 bits.
    let top = (surrogate >> 12) as u8;
    let middle = ((surrogate >> 6) & SIX_BITS) as u8;
    let bottom = (surrogate & SIX_BITS) as u8;

    [
        LEAD_TAG | top,
        CONTINUATION_TAG | middle,
        CONTINUATION_TAG | bottom,
    ]
}

/// Returns the number of bytes that `text` occupies when encoded as Java's
/// "Modified UTF-8", i.e. the number of bytes `write_modified_utf8` emits
/// for the same input.
///
/// The length is computed in a single pass over the UTF-8 bytes of `text`,
/// without decoding characters:
///
/// - a NUL byte is encoded as two bytes, so it counts as 2;
/// - the lead byte of a four-byte sequence (`0xF0..=0xF4`) counts as 3, which
///   together with its three continuation bytes gives the 6 bytes of an
///   encoded surrogate pair;
/// - every other byte is copied through unchanged and counts as 1.
pub fn encoded_len(text: &str) -> usize {
    text.bytes()
        .map(|byte| -> usize {
            match byte {
                0 => 2,
                0xf0..=0xf4 => 3,
                _ => 1,
            }
        })
        .sum()
}

#[cfg(test)]
#[test]
fn equivalence() {
    // Checks that our encoder output and our length calculation both agree
    // with the `cesu8` crate's Java-flavoured encoder for `input`.
    fn assert_matches_reference(input: &str) {
        let expected = cesu8::to_java_cesu8(input);

        let mut actual = Vec::new();
        write_modified_utf8(&mut actual, input).unwrap();

        assert_eq!(expected, actual);
        assert_eq!(expected.len(), encoded_len(input));
    }

    // Covers ASCII with an embedded NUL, four-byte (and one three-byte)
    // characters, and two-byte characters.
    let samples = [
        "Mary had a little lamb\0",
        "🤡💩👻💀☠👽👾🤖🎃😺😸😹😻😼😽🙀😿😾",
        "ÅÆÇÈØõ÷£¥ý",
    ];

    for sample in samples.iter() {
        assert_matches_reference(sample);
    }
}
Update valence_nbt to version 0.3.0 (#120) Improves write performance and adds `binary_encoded_len` to compounds. 2022-10-18 15:11:20 +11:00			`//! Utilities for working with Java's "Modified UTF-8" character encoding.`
			`//!`
			`//! For more information, refer to [Wikipedia].`
			`//!`
			`//! [Wikipedia]: https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8`

			`use std::io;`
			`use std::io::Write;`
			`use std::str::from_utf8_unchecked;`

			`use byteorder::{BigEndian, WriteBytesExt};`

			`pub fn write_modified_utf8(mut writer: impl Write, text: &str) -> io::Result<()> {`
			`let bytes = text.as_bytes();`
			`let mut i = 0;`

			`while i < bytes.len() {`
			`match bytes[i] {`
			`0 => {`
			`writer.write_u16::<BigEndian>(0xc080)?;`
			`i += 1;`
			`}`
			`b @ 1..=127 => {`
			`writer.write_u8(b)?;`
			`i += 1;`
			`}`
			`b => {`
			`let w = utf8_char_width(b);`
			`debug_assert!(w <= 4);`
			`debug_assert!(i + w <= bytes.len());`

			`if w != 4 {`
			`writer.write_all(&bytes[i..i + w])?;`
			`} else {`
			`let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) };`
			`let c = s.chars().next().unwrap() as u32 - 0x10000;`

			`let s0 = ((c >> 10) as u16) \| 0xd800;`
			`let s1 = ((c & 0x3ff) as u16) \| 0xdc00;`

			`writer.write_all(encode_surrogate(s0).as_slice())?;`
			`writer.write_all(encode_surrogate(s1).as_slice())?;`
			`}`
			`i += w;`
			`}`
			`}`
			`}`

			`Ok(())`
			`}`

			`const fn utf8_char_width(first_byte: u8) -> usize {`
			`const UTF8_CHAR_WIDTH: [u8; 256] = [`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,`
			`2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,`
			`4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`];`

			`UTF8_CHAR_WIDTH[first_byte as usize] as _`
			`}`

			`fn encode_surrogate(surrogate: u16) -> [u8; 3] {`
			`debug_assert!((0xd800..=0xdfff).contains(&surrogate));`

			`const TAG_CONT_U8: u8 = 0b1000_0000u8;`
			`[`
			`0b11100000 \| ((surrogate & 0b11110000_00000000) >> 12) as u8,`
			`TAG_CONT_U8 \| ((surrogate & 0b00001111_11000000) >> 6) as u8,`
			`TAG_CONT_U8 \| (surrogate & 0b00000000_00111111) as u8,`
			`]`
			`}`

			`pub fn encoded_len(text: &str) -> usize {`
			`let mut n = 0;`
			`let mut i = 0;`
			`let bytes = text.as_bytes();`

			`while i < bytes.len() {`
			`match bytes[i] {`
			`0 => {`
			`n += 2;`
			`i += 1;`
			`}`
			`// Fast path for ASCII here makes a huge difference in benchmarks.`
			`1..=127 => {`
			`n += 1;`
			`i += 1;`
			`}`
			`b => {`
			`let w = utf8_char_width(b);`

			`if w == 4 {`
			`n += 6;`
			`} else {`
			`n += w;`
			`}`

			`i += w;`
			`}`
			`}`
			`}`

			`n`
			`}`

			`#[cfg(test)]`
			`#[test]`
			`fn equivalence() {`
			`fn check(s: &str) {`
			`let mut ours = Vec::new();`

			`let theirs = cesu8::to_java_cesu8(s);`
			`write_modified_utf8(&mut ours, s).unwrap();`

			`assert_eq!(theirs, ours);`
			`assert_eq!(theirs.len(), encoded_len(s));`
			`}`

			`check("Mary had a little lamb\0");`
			`check("🤡💩👻💀☠👽👾🤖🎃😺😸😹😻😼😽🙀😿😾");`
			`check("ÅÆÇÈØõ÷£¥ý");`
			`}`