From d22f1edae1c379a526d7093f0198e9481c043ee0 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 17 Oct 2022 21:11:20 -0700 Subject: [PATCH] Update valence_nbt to version 0.3.0 (#120) Improves write performance and adds `binary_encoded_len` to compounds. --- src/lib.rs | 4 +- valence_nbt/Cargo.toml | 2 +- valence_nbt/src/compound.rs | 17 ++++ valence_nbt/src/from_binary_slice.rs | 5 +- valence_nbt/src/lib.rs | 21 ++++- valence_nbt/src/modified_utf8.rs | 128 +++++++++++++++++++++++++++ valence_nbt/src/tests.rs | 42 +++------ valence_nbt/src/to_binary_writer.rs | 101 ++++++++++++++------- 8 files changed, 250 insertions(+), 70 deletions(-) create mode 100644 valence_nbt/src/modified_utf8.rs diff --git a/src/lib.rs b/src/lib.rs index fb73d9c..9e3d478 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,14 +79,14 @@ trivial_casts, trivial_numeric_casts, unused_lifetimes, - unused_import_braces + unused_import_braces, + clippy::dbg_macro )] #![allow( clippy::derive_partial_eq_without_eq, clippy::unusual_byte_groupings, clippy::comparison_chain )] -#![deny(clippy::dbg_macro)] /// Used on [`Config`](config::Config) to allow for async methods in traits. /// diff --git a/valence_nbt/Cargo.toml b/valence_nbt/Cargo.toml index f0aa7af..9db85c7 100644 --- a/valence_nbt/Cargo.toml +++ b/valence_nbt/Cargo.toml @@ -6,7 +6,7 @@ repository = "https://github.com/valence-rs/valence/tree/main/valence_nbt" readme = "README.md" license = "MIT" keywords = ["nbt", "minecraft", "serialization"] -version = "0.2.0" +version = "0.3.0" authors = ["Ryan Johnson "] edition = "2021" diff --git a/valence_nbt/src/compound.rs b/valence_nbt/src/compound.rs index f81b35d..af13fc4 100644 --- a/valence_nbt/src/compound.rs +++ b/valence_nbt/src/compound.rs @@ -3,6 +3,7 @@ use std::hash::Hash; use std::iter::FusedIterator; use std::ops::{Index, IndexMut}; +use crate::to_binary_writer::encoded_len; use crate::Value; /// A map type with [`String`] keys and [`Value`] values. 
@@ -17,6 +18,22 @@ type Map = std::collections::BTreeMap; #[cfg(feature = "preserve_order")] type Map = indexmap::IndexMap; +impl Compound { + /// Returns the number of bytes that will be written with + /// [`to_binary_writer`] when called with this compound and root name. + /// + /// If [`to_binary_writer`] results in `Ok`, the exact number of bytes + /// reported by this function will have been written. + /// + /// If the result is `Err`, then the reported count will be greater than or + /// equal to the number of bytes that have actually been written. + /// + /// [`to_binary_writer`]: crate::to_binary_writer() + pub fn binary_encoded_len(&self, root_name: &str) -> usize { + encoded_len(self, root_name) + } +} + impl Compound { pub fn new() -> Self { Self { map: Map::new() } diff --git a/valence_nbt/src/from_binary_slice.rs b/valence_nbt/src/from_binary_slice.rs index dea648b..f519909 100644 --- a/valence_nbt/src/from_binary_slice.rs +++ b/valence_nbt/src/from_binary_slice.rs @@ -4,7 +4,7 @@ use byteorder::{BigEndian, ReadBytesExt}; use cesu8::Cesu8DecodingError; use crate::tag::Tag; -use crate::{Compound, Error, List, Result, Value, MAX_DEPTH}; +use crate::{Compound, Error, List, Result, Value}; /// Decodes uncompressed NBT binary data from the provided slice. /// @@ -27,6 +27,9 @@ pub fn from_binary_slice(slice: &mut &[u8]) -> Result<(Compound, String)> { Ok((root, root_name)) } +/// Maximum recursion depth to prevent overflowing the call stack. +const MAX_DEPTH: usize = 512; + struct DecodeState<'a, 'b> { slice: &'a mut &'b [u8], /// Current recursion depth. diff --git a/valence_nbt/src/lib.rs b/valence_nbt/src/lib.rs index 797b027..5eb2bc0 100644 --- a/valence_nbt/src/lib.rs +++ b/valence_nbt/src/lib.rs @@ -49,7 +49,22 @@ //! preserved during insertion and deletion at a slight cost to performance. //! The iterators on `Compound` can then implement [`DoubleEndedIterator`]. 
-#![deny(unsafe_code)] +#![deny( + rustdoc::broken_intra_doc_links, + rustdoc::private_intra_doc_links, + rustdoc::missing_crate_level_docs, + rustdoc::invalid_codeblock_attributes, + rustdoc::invalid_rust_codeblocks, + rustdoc::bare_urls +)] +#![warn( + trivial_casts, + trivial_numeric_casts, + unused_lifetimes, + unused_import_braces, + clippy::dbg_macro +)] +#![allow(clippy::unusual_byte_groupings)] pub use compound::Compound; pub use error::Error; @@ -60,6 +75,7 @@ pub use value::{List, Value}; pub mod compound; mod error; mod from_binary_slice; +mod modified_utf8; mod to_binary_writer; pub mod value; @@ -67,9 +83,6 @@ mod tag; #[cfg(test)] mod tests; -/// Maximum recursion depth to prevent overflowing the call stack. -const MAX_DEPTH: usize = 512; - type Result = std::result::Result; /// A convenience macro for constructing [`Compound`]s. diff --git a/valence_nbt/src/modified_utf8.rs b/valence_nbt/src/modified_utf8.rs new file mode 100644 index 0000000..6904348 --- /dev/null +++ b/valence_nbt/src/modified_utf8.rs @@ -0,0 +1,128 @@ +//! Utilities for working with Java's "Modified UTF-8" character encoding. +//! +//! For more information, refer to [Wikipedia]. +//! +//! 
[Wikipedia]: https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 + +use std::io; +use std::io::Write; +use std::str::from_utf8_unchecked; + +use byteorder::{BigEndian, WriteBytesExt}; + +pub fn write_modified_utf8(mut writer: impl Write, text: &str) -> io::Result<()> { + let bytes = text.as_bytes(); + let mut i = 0; + + while i < bytes.len() { + match bytes[i] { + 0 => { + writer.write_u16::(0xc080)?; + i += 1; + } + b @ 1..=127 => { + writer.write_u8(b)?; + i += 1; + } + b => { + let w = utf8_char_width(b); + debug_assert!(w <= 4); + debug_assert!(i + w <= bytes.len()); + + if w != 4 { + writer.write_all(&bytes[i..i + w])?; + } else { + let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) }; + let c = s.chars().next().unwrap() as u32 - 0x10000; + + let s0 = ((c >> 10) as u16) | 0xd800; + let s1 = ((c & 0x3ff) as u16) | 0xdc00; + + writer.write_all(encode_surrogate(s0).as_slice())?; + writer.write_all(encode_surrogate(s1).as_slice())?; + } + i += w; + } + } + } + + Ok(()) +} + +const fn utf8_char_width(first_byte: u8) -> usize { + const UTF8_CHAR_WIDTH: [u8; 256] = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + + UTF8_CHAR_WIDTH[first_byte as usize] as _ +} + +fn encode_surrogate(surrogate: u16) -> [u8; 3] { + 
debug_assert!((0xd800..=0xdfff).contains(&surrogate)); + + const TAG_CONT_U8: u8 = 0b1000_0000u8; + [ + 0b11100000 | ((surrogate & 0b11110000_00000000) >> 12) as u8, + TAG_CONT_U8 | ((surrogate & 0b00001111_11000000) >> 6) as u8, + TAG_CONT_U8 | (surrogate & 0b00000000_00111111) as u8, + ] +} + +pub fn encoded_len(text: &str) -> usize { + let mut n = 0; + let mut i = 0; + let bytes = text.as_bytes(); + + while i < bytes.len() { + match bytes[i] { + 0 => { + n += 2; + i += 1; + } + // Fast path for ASCII here makes a huge difference in benchmarks. + 1..=127 => { + n += 1; + i += 1; + } + b => { + let w = utf8_char_width(b); + + if w == 4 { + n += 6; + } else { + n += w; + } + + i += w; + } + } + } + + n +} + +#[cfg(test)] +#[test] +fn equivalence() { + fn check(s: &str) { + let mut ours = Vec::new(); + + let theirs = cesu8::to_java_cesu8(s); + write_modified_utf8(&mut ours, s).unwrap(); + + assert_eq!(theirs, ours); + assert_eq!(theirs.len(), encoded_len(s)); + } + + check("Mary had a little lamb\0"); + check("🤡💩👻💀☠👽👾🤖🎃😺😸😹😻😼😽🙀😿😾"); + check("ÅÆÇÈØõ÷£¥ý"); +} diff --git a/valence_nbt/src/tests.rs b/valence_nbt/src/tests.rs index 8beadd7..080a072 100644 --- a/valence_nbt/src/tests.rs +++ b/valence_nbt/src/tests.rs @@ -1,5 +1,3 @@ -use std::mem; - use crate::tag::Tag; use crate::{compound, from_binary_slice, to_binary_writer, Compound, List, Value}; @@ -53,20 +51,6 @@ fn check_min_sizes() { check(Value::LongArray([].into()), 4); } -#[test] -fn deeply_nested_compound_encode() { - let mut c = compound!("" => 111_i8); - for _ in 0..10_000 { - c = compound!("" => c); - } - - // Should not overflow the stack - let _ = to_binary_writer(&mut Vec::new(), &c, ROOT_NAME); - - // Don"t overflow the stack while dropping. 
- mem::forget(c); -} - #[test] fn deeply_nested_compound_decode() { let mut buf = vec![Tag::Compound as u8, 0, 0]; // Root compound @@ -84,22 +68,6 @@ fn deeply_nested_compound_decode() { let _ = from_binary_slice(&mut buf.as_slice()); } -#[test] -fn deeply_nested_list_encode() { - let mut l = List::Byte(Vec::new()); - for _ in 0..10_000 { - l = List::List(vec![l]); - } - - let c = compound!("" => l); - - // Should not panic - let _ = to_binary_writer(&mut Vec::new(), &c, ROOT_NAME); - - // Don"t overflow the stack while dropping. - mem::forget(c); -} - #[test] fn deeply_nested_list_decode() { // Root compound with one field. @@ -119,6 +87,16 @@ fn deeply_nested_list_decode() { let _ = from_binary_slice(&mut buf.as_slice()); } +#[test] +fn correct_length() { + let c = example_compound(); + + let mut buf = Vec::new(); + to_binary_writer(&mut buf, &c, "abc").unwrap(); + + assert_eq!(c.binary_encoded_len("abc"), buf.len()); +} + #[cfg(feature = "preserve_order")] #[test] fn preserves_order() { diff --git a/valence_nbt/src/to_binary_writer.rs b/valence_nbt/src/to_binary_writer.rs index a799b2c..0060c69 100644 --- a/valence_nbt/src/to_binary_writer.rs +++ b/valence_nbt/src/to_binary_writer.rs @@ -4,7 +4,7 @@ use byteorder::{BigEndian, WriteBytesExt}; use zerocopy::AsBytes; use crate::tag::Tag; -use crate::{Compound, Error, List, Result, Value, MAX_DEPTH}; +use crate::{modified_utf8, Compound, Error, List, Result, Value}; /// Encodes uncompressed NBT binary data to the provided writer. /// @@ -14,35 +14,71 @@ use crate::{Compound, Error, List, Result, Value, MAX_DEPTH}; /// Additionally, the root compound can be given a name. Typically the empty /// string `""` is used. 
pub fn to_binary_writer(writer: W, compound: &Compound, root_name: &str) -> Result<()> { - let mut state = EncodeState { writer, depth: 0 }; + let mut state = EncodeState { writer }; state.write_tag(Tag::Compound)?; state.write_string(root_name)?; state.write_compound(compound)?; - debug_assert_eq!(state.depth, 0); Ok(()) } +pub(crate) fn encoded_len(compound: &Compound, root_name: &str) -> usize { + fn value_len(val: &Value) -> usize { + match val { + Value::Byte(_) => 1, + Value::Short(_) => 2, + Value::Int(_) => 4, + Value::Long(_) => 8, + Value::Float(_) => 4, + Value::Double(_) => 8, + Value::ByteArray(ba) => 4 + ba.len(), + Value::String(s) => string_len(s), + Value::List(l) => list_len(l), + Value::Compound(c) => compound_len(c), + Value::IntArray(ia) => 4 + ia.len() * 4, + Value::LongArray(la) => 4 + la.len() * 8, + } + } + + fn list_len(l: &List) -> usize { + let elems_len = match l { + List::Byte(b) => b.len(), + List::Short(s) => s.len() * 2, + List::Int(i) => i.len() * 4, + List::Long(l) => l.len() * 8, + List::Float(f) => f.len() * 4, + List::Double(d) => d.len() * 8, + List::ByteArray(ba) => ba.iter().map(|b| 4 + b.len()).sum(), + List::String(s) => s.iter().map(|s| string_len(s)).sum(), + List::List(l) => l.iter().map(list_len).sum(), + List::Compound(c) => c.iter().map(compound_len).sum(), + List::IntArray(i) => i.iter().map(|i| 4 + i.len() * 4).sum(), + List::LongArray(l) => l.iter().map(|l| 4 + l.len() * 8).sum(), + }; + + 1 + 4 + elems_len + } + + fn string_len(s: &str) -> usize { + 2 + modified_utf8::encoded_len(s) + } + + fn compound_len(c: &Compound) -> usize { + c.iter() + .map(|(k, v)| 1 + string_len(k) + value_len(v)) + .sum::() + + 1 + } + + 1 + string_len(root_name) + compound_len(compound) +} + struct EncodeState { writer: W, - /// Current recursion depth. 
- depth: usize, } impl EncodeState { - #[inline] - fn check_depth(&mut self, f: impl FnOnce(&mut Self) -> Result) -> Result { - if self.depth >= MAX_DEPTH { - return Err(Error::new_static("reached maximum recursion depth")); - } - - self.depth += 1; - let res = f(self); - self.depth -= 1; - res - } - fn write_tag(&mut self, tag: Tag) -> Result<()> { Ok(self.writer.write_u8(tag as u8)?) } @@ -57,8 +93,8 @@ impl EncodeState { Value::Double(d) => self.write_double(*d), Value::ByteArray(ba) => self.write_byte_array(ba), Value::String(s) => self.write_string(s), - Value::List(l) => self.check_depth(|st| st.write_any_list(l)), - Value::Compound(c) => self.check_depth(|st| st.write_compound(c)), + Value::List(l) => self.write_any_list(l), + Value::Compound(c) => self.write_compound(c), Value::IntArray(ia) => self.write_int_array(ia), Value::LongArray(la) => self.write_long_array(la), } @@ -103,19 +139,27 @@ impl EncodeState { } fn write_string(&mut self, s: &str) -> Result<()> { - let s = cesu8::to_java_cesu8(s); + let len = modified_utf8::encoded_len(s); - match s.len().try_into() { - Ok(len) => self.writer.write_u16::(len)?, + match len.try_into() { + Ok(n) => self.writer.write_u16::(n)?, Err(_) => { return Err(Error::new_owned(format!( - "string of length {} exceeds maximum of u16::MAX", - s.len() + "string of length {len} exceeds maximum of u16::MAX" ))) } } - Ok(self.writer.write_all(&s)?) + // Conversion to modified UTF-8 never decreases the size of the string. + // If the new len is equal to the original len, we know it doesn't need + // to be re-encoded. 
+ if len == s.len() { + self.writer.write_all(s.as_bytes())?; + } else { + modified_utf8::write_modified_utf8(&mut self.writer, s)?; + } + + Ok(()) } fn write_any_list(&mut self, list: &List) -> Result<()> { @@ -144,11 +188,8 @@ impl EncodeState { self.write_list(bal, Tag::ByteArray, |st, ba| st.write_byte_array(ba)) } List::String(sl) => self.write_list(sl, Tag::String, |st, s| st.write_string(s)), - List::List(ll) => { - self.check_depth(|st| st.write_list(ll, Tag::List, |st, l| st.write_any_list(l))) - } - List::Compound(cl) => self - .check_depth(|st| st.write_list(cl, Tag::Compound, |st, c| st.write_compound(c))), + List::List(ll) => self.write_list(ll, Tag::List, |st, l| st.write_any_list(l)), + List::Compound(cl) => self.write_list(cl, Tag::Compound, |st, c| st.write_compound(c)), List::IntArray(ial) => { self.write_list(ial, Tag::IntArray, |st, ia| st.write_int_array(ia)) }