From d22f1edae1c379a526d7093f0198e9481c043ee0 Mon Sep 17 00:00:00 2001
From: Ryan Johnson <ryanj00a@gmail.com>
Date: Mon, 17 Oct 2022 21:11:20 -0700
Subject: [PATCH] Update valence_nbt to version 0.3.0 (#120)

Improves write performance and adds `binary_encoded_len` to compounds.
---
 src/lib.rs                           |   4 +-
 valence_nbt/Cargo.toml               |   2 +-
 valence_nbt/src/compound.rs          |  17 ++++
 valence_nbt/src/from_binary_slice.rs |   5 +-
 valence_nbt/src/lib.rs               |  21 ++++-
 valence_nbt/src/modified_utf8.rs     | 128 +++++++++++++++++++++++++++
 valence_nbt/src/tests.rs             |  42 +++------
 valence_nbt/src/to_binary_writer.rs  | 101 ++++++++++++++-------
 8 files changed, 250 insertions(+), 70 deletions(-)
 create mode 100644 valence_nbt/src/modified_utf8.rs

diff --git a/src/lib.rs b/src/lib.rs
index fb73d9c..9e3d478 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -79,14 +79,14 @@
     trivial_casts,
     trivial_numeric_casts,
     unused_lifetimes,
-    unused_import_braces
+    unused_import_braces,
+    clippy::dbg_macro
 )]
 #![allow(
     clippy::derive_partial_eq_without_eq,
     clippy::unusual_byte_groupings,
     clippy::comparison_chain
 )]
-#![deny(clippy::dbg_macro)]
 
 /// Used on [`Config`](config::Config) to allow for async methods in traits.
 ///
diff --git a/valence_nbt/Cargo.toml b/valence_nbt/Cargo.toml
index f0aa7af..9db85c7 100644
--- a/valence_nbt/Cargo.toml
+++ b/valence_nbt/Cargo.toml
@@ -6,7 +6,7 @@ repository = "https://github.com/valence-rs/valence/tree/main/valence_nbt"
 readme = "README.md"
 license = "MIT"
 keywords = ["nbt", "minecraft", "serialization"]
-version = "0.2.0"
+version = "0.3.0"
 authors = ["Ryan Johnson <ryanj00a@gmail.com>"]
 edition = "2021"
 
diff --git a/valence_nbt/src/compound.rs b/valence_nbt/src/compound.rs
index f81b35d..af13fc4 100644
--- a/valence_nbt/src/compound.rs
+++ b/valence_nbt/src/compound.rs
@@ -3,6 +3,7 @@ use std::hash::Hash;
 use std::iter::FusedIterator;
 use std::ops::{Index, IndexMut};
 
+use crate::to_binary_writer::encoded_len;
 use crate::Value;
 
 /// A map type with [`String`] keys and [`Value`] values.
@@ -17,6 +18,22 @@ type Map = std::collections::BTreeMap<String, Value>;
 #[cfg(feature = "preserve_order")]
 type Map = indexmap::IndexMap<String, Value>;
 
+impl Compound {
+    /// Returns the number of bytes that will be written with
+    /// [`to_binary_writer`] when called with this compound and root name.
+    ///
+    /// If [`to_binary_writer`] results in `Ok`, the exact number of bytes
+    /// reported by this function will have been written.
+    ///
+    /// If the result is `Err`, then the reported count will be greater than or
+    /// equal to the number of bytes that have actually been written.
+    ///
+    /// [`to_binary_writer`]: crate::to_binary_writer()
+    pub fn binary_encoded_len(&self, root_name: &str) -> usize {
+        encoded_len(self, root_name)
+    }
+}
+
 impl Compound {
     pub fn new() -> Self {
         Self { map: Map::new() }
diff --git a/valence_nbt/src/from_binary_slice.rs b/valence_nbt/src/from_binary_slice.rs
index dea648b..f519909 100644
--- a/valence_nbt/src/from_binary_slice.rs
+++ b/valence_nbt/src/from_binary_slice.rs
@@ -4,7 +4,7 @@ use byteorder::{BigEndian, ReadBytesExt};
 use cesu8::Cesu8DecodingError;
 
 use crate::tag::Tag;
-use crate::{Compound, Error, List, Result, Value, MAX_DEPTH};
+use crate::{Compound, Error, List, Result, Value};
 
 /// Decodes uncompressed NBT binary data from the provided slice.
 ///
@@ -27,6 +27,9 @@ pub fn from_binary_slice(slice: &mut &[u8]) -> Result<(Compound, String)> {
     Ok((root, root_name))
 }
 
+/// Maximum recursion depth to prevent overflowing the call stack.
+const MAX_DEPTH: usize = 512;
+
 struct DecodeState<'a, 'b> {
     slice: &'a mut &'b [u8],
     /// Current recursion depth.
diff --git a/valence_nbt/src/lib.rs b/valence_nbt/src/lib.rs
index 797b027..5eb2bc0 100644
--- a/valence_nbt/src/lib.rs
+++ b/valence_nbt/src/lib.rs
@@ -49,7 +49,22 @@
 //! preserved during insertion and deletion at a slight cost to performance.
 //! The iterators on `Compound` can then implement [`DoubleEndedIterator`].
 
-#![deny(unsafe_code)]
+#![deny(
+    rustdoc::broken_intra_doc_links,
+    rustdoc::private_intra_doc_links,
+    rustdoc::missing_crate_level_docs,
+    rustdoc::invalid_codeblock_attributes,
+    rustdoc::invalid_rust_codeblocks,
+    rustdoc::bare_urls
+)]
+#![warn(
+    trivial_casts,
+    trivial_numeric_casts,
+    unused_lifetimes,
+    unused_import_braces,
+    clippy::dbg_macro
+)]
+#![allow(clippy::unusual_byte_groupings)]
 
 pub use compound::Compound;
 pub use error::Error;
@@ -60,6 +75,7 @@ pub use value::{List, Value};
 pub mod compound;
 mod error;
 mod from_binary_slice;
+mod modified_utf8;
 mod to_binary_writer;
 pub mod value;
 
@@ -67,9 +83,6 @@ mod tag;
 #[cfg(test)]
 mod tests;
 
-/// Maximum recursion depth to prevent overflowing the call stack.
-const MAX_DEPTH: usize = 512;
-
 type Result<T> = std::result::Result<T, Error>;
 
 /// A convenience macro for constructing [`Compound`]s.
diff --git a/valence_nbt/src/modified_utf8.rs b/valence_nbt/src/modified_utf8.rs
new file mode 100644
index 0000000..6904348
--- /dev/null
+++ b/valence_nbt/src/modified_utf8.rs
@@ -0,0 +1,128 @@
+//! Utilities for working with Java's "Modified UTF-8" character encoding.
+//!
+//! For more information, refer to [Wikipedia].
+//!
+//! [Wikipedia]: https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8
+
+use std::io;
+use std::io::Write;
+use std::str::from_utf8_unchecked;
+
+use byteorder::{BigEndian, WriteBytesExt};
+
+pub fn write_modified_utf8(mut writer: impl Write, text: &str) -> io::Result<()> {
+    let bytes = text.as_bytes();
+    let mut i = 0;
+
+    while i < bytes.len() {
+        match bytes[i] {
+            0 => {
+                writer.write_u16::<BigEndian>(0xc080)?;
+                i += 1;
+            }
+            b @ 1..=127 => {
+                writer.write_u8(b)?;
+                i += 1;
+            }
+            b => {
+                let w = utf8_char_width(b);
+                debug_assert!(w <= 4);
+                debug_assert!(i + w <= bytes.len());
+
+                if w != 4 {
+                    writer.write_all(&bytes[i..i + w])?;
+                } else {
+                    let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) };
+                    let c = s.chars().next().unwrap() as u32 - 0x10000;
+
+                    let s0 = ((c >> 10) as u16) | 0xd800;
+                    let s1 = ((c & 0x3ff) as u16) | 0xdc00;
+
+                    writer.write_all(encode_surrogate(s0).as_slice())?;
+                    writer.write_all(encode_surrogate(s1).as_slice())?;
+                }
+                i += w;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+const fn utf8_char_width(first_byte: u8) -> usize {
+    const UTF8_CHAR_WIDTH: [u8; 256] = [
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+        4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    ];
+
+    UTF8_CHAR_WIDTH[first_byte as usize] as _
+}
+
+fn encode_surrogate(surrogate: u16) -> [u8; 3] {
+    debug_assert!((0xd800..=0xdfff).contains(&surrogate));
+
+    const TAG_CONT_U8: u8 = 0b1000_0000u8;
+    [
+        0b11100000 | ((surrogate & 0b11110000_00000000) >> 12) as u8,
+        TAG_CONT_U8 | ((surrogate & 0b00001111_11000000) >> 6) as u8,
+        TAG_CONT_U8 | (surrogate & 0b00000000_00111111) as u8,
+    ]
+}
+
+pub fn encoded_len(text: &str) -> usize {
+    let mut n = 0;
+    let mut i = 0;
+    let bytes = text.as_bytes();
+
+    while i < bytes.len() {
+        match bytes[i] {
+            0 => {
+                n += 2;
+                i += 1;
+            }
+            // Fast path for ASCII here makes a huge difference in benchmarks.
+            1..=127 => {
+                n += 1;
+                i += 1;
+            }
+            b => {
+                let w = utf8_char_width(b);
+
+                if w == 4 {
+                    n += 6;
+                } else {
+                    n += w;
+                }
+
+                i += w;
+            }
+        }
+    }
+
+    n
+}
+
+#[cfg(test)]
+#[test]
+fn equivalence() {
+    fn check(s: &str) {
+        let mut ours = Vec::new();
+
+        let theirs = cesu8::to_java_cesu8(s);
+        write_modified_utf8(&mut ours, s).unwrap();
+
+        assert_eq!(theirs, ours);
+        assert_eq!(theirs.len(), encoded_len(s));
+    }
+
+    check("Mary had a little lamb\0");
+    check("🤡💩👻💀☠👽👾🤖🎃😺😸😹😻😼😽🙀😿😾");
+    check("ÅÆÇÈØõ÷£¥ý");
+}
diff --git a/valence_nbt/src/tests.rs b/valence_nbt/src/tests.rs
index 8beadd7..080a072 100644
--- a/valence_nbt/src/tests.rs
+++ b/valence_nbt/src/tests.rs
@@ -1,5 +1,3 @@
-use std::mem;
-
 use crate::tag::Tag;
 use crate::{compound, from_binary_slice, to_binary_writer, Compound, List, Value};
 
@@ -53,20 +51,6 @@ fn check_min_sizes() {
     check(Value::LongArray([].into()), 4);
 }
 
-#[test]
-fn deeply_nested_compound_encode() {
-    let mut c = compound!("" => 111_i8);
-    for _ in 0..10_000 {
-        c = compound!("" => c);
-    }
-
-    // Should not overflow the stack
-    let _ = to_binary_writer(&mut Vec::new(), &c, ROOT_NAME);
-
-    // Don't overflow the stack while dropping.
-    mem::forget(c);
-}
-
 #[test]
 fn deeply_nested_compound_decode() {
     let mut buf = vec![Tag::Compound as u8, 0, 0]; // Root compound
@@ -84,22 +68,6 @@ fn deeply_nested_compound_decode() {
     let _ = from_binary_slice(&mut buf.as_slice());
 }
 
-#[test]
-fn deeply_nested_list_encode() {
-    let mut l = List::Byte(Vec::new());
-    for _ in 0..10_000 {
-        l = List::List(vec![l]);
-    }
-
-    let c = compound!("" => l);
-
-    // Should not panic
-    let _ = to_binary_writer(&mut Vec::new(), &c, ROOT_NAME);
-
-    // Don't overflow the stack while dropping.
-    mem::forget(c);
-}
-
 #[test]
 fn deeply_nested_list_decode() {
     // Root compound with one field.
@@ -119,6 +87,16 @@ fn deeply_nested_list_decode() {
     let _ = from_binary_slice(&mut buf.as_slice());
 }
 
+#[test]
+fn correct_length() {
+    let c = example_compound();
+
+    let mut buf = Vec::new();
+    to_binary_writer(&mut buf, &c, "abc").unwrap();
+
+    assert_eq!(c.binary_encoded_len("abc"), buf.len());
+}
+
 #[cfg(feature = "preserve_order")]
 #[test]
 fn preserves_order() {
diff --git a/valence_nbt/src/to_binary_writer.rs b/valence_nbt/src/to_binary_writer.rs
index a799b2c..0060c69 100644
--- a/valence_nbt/src/to_binary_writer.rs
+++ b/valence_nbt/src/to_binary_writer.rs
@@ -4,7 +4,7 @@ use byteorder::{BigEndian, WriteBytesExt};
 use zerocopy::AsBytes;
 
 use crate::tag::Tag;
-use crate::{Compound, Error, List, Result, Value, MAX_DEPTH};
+use crate::{modified_utf8, Compound, Error, List, Result, Value};
 
 /// Encodes uncompressed NBT binary data to the provided writer.
 ///
@@ -14,35 +14,71 @@ use crate::{Compound, Error, List, Result, Value, MAX_DEPTH};
 /// Additionally, the root compound can be given a name. Typically the empty
 /// string `""` is used.
 pub fn to_binary_writer<W: Write>(writer: W, compound: &Compound, root_name: &str) -> Result<()> {
-    let mut state = EncodeState { writer, depth: 0 };
+    let mut state = EncodeState { writer };
 
     state.write_tag(Tag::Compound)?;
     state.write_string(root_name)?;
     state.write_compound(compound)?;
 
-    debug_assert_eq!(state.depth, 0);
     Ok(())
 }
 
+pub(crate) fn encoded_len(compound: &Compound, root_name: &str) -> usize {
+    fn value_len(val: &Value) -> usize {
+        match val {
+            Value::Byte(_) => 1,
+            Value::Short(_) => 2,
+            Value::Int(_) => 4,
+            Value::Long(_) => 8,
+            Value::Float(_) => 4,
+            Value::Double(_) => 8,
+            Value::ByteArray(ba) => 4 + ba.len(),
+            Value::String(s) => string_len(s),
+            Value::List(l) => list_len(l),
+            Value::Compound(c) => compound_len(c),
+            Value::IntArray(ia) => 4 + ia.len() * 4,
+            Value::LongArray(la) => 4 + la.len() * 8,
+        }
+    }
+
+    fn list_len(l: &List) -> usize {
+        let elems_len = match l {
+            List::Byte(b) => b.len(),
+            List::Short(s) => s.len() * 2,
+            List::Int(i) => i.len() * 4,
+            List::Long(l) => l.len() * 8,
+            List::Float(f) => f.len() * 4,
+            List::Double(d) => d.len() * 8,
+            List::ByteArray(ba) => ba.iter().map(|b| 4 + b.len()).sum(),
+            List::String(s) => s.iter().map(|s| string_len(s)).sum(),
+            List::List(l) => l.iter().map(list_len).sum(),
+            List::Compound(c) => c.iter().map(compound_len).sum(),
+            List::IntArray(i) => i.iter().map(|i| 4 + i.len() * 4).sum(),
+            List::LongArray(l) => l.iter().map(|l| 4 + l.len() * 8).sum(),
+        };
+
+        1 + 4 + elems_len
+    }
+
+    fn string_len(s: &str) -> usize {
+        2 + modified_utf8::encoded_len(s)
+    }
+
+    fn compound_len(c: &Compound) -> usize {
+        c.iter()
+            .map(|(k, v)| 1 + string_len(k) + value_len(v))
+            .sum::<usize>()
+            + 1
+    }
+
+    1 + string_len(root_name) + compound_len(compound)
+}
+
 struct EncodeState<W> {
     writer: W,
-    /// Current recursion depth.
-    depth: usize,
 }
 
 impl<W: Write> EncodeState<W> {
-    #[inline]
-    fn check_depth<T>(&mut self, f: impl FnOnce(&mut Self) -> Result<T>) -> Result<T> {
-        if self.depth >= MAX_DEPTH {
-            return Err(Error::new_static("reached maximum recursion depth"));
-        }
-
-        self.depth += 1;
-        let res = f(self);
-        self.depth -= 1;
-        res
-    }
-
     fn write_tag(&mut self, tag: Tag) -> Result<()> {
         Ok(self.writer.write_u8(tag as u8)?)
     }
@@ -57,8 +93,8 @@ impl<W: Write> EncodeState<W> {
             Value::Double(d) => self.write_double(*d),
             Value::ByteArray(ba) => self.write_byte_array(ba),
             Value::String(s) => self.write_string(s),
-            Value::List(l) => self.check_depth(|st| st.write_any_list(l)),
-            Value::Compound(c) => self.check_depth(|st| st.write_compound(c)),
+            Value::List(l) => self.write_any_list(l),
+            Value::Compound(c) => self.write_compound(c),
             Value::IntArray(ia) => self.write_int_array(ia),
             Value::LongArray(la) => self.write_long_array(la),
         }
@@ -103,19 +139,27 @@ impl<W: Write> EncodeState<W> {
     }
 
     fn write_string(&mut self, s: &str) -> Result<()> {
-        let s = cesu8::to_java_cesu8(s);
+        let len = modified_utf8::encoded_len(s);
 
-        match s.len().try_into() {
-            Ok(len) => self.writer.write_u16::<BigEndian>(len)?,
+        match len.try_into() {
+            Ok(n) => self.writer.write_u16::<BigEndian>(n)?,
             Err(_) => {
                 return Err(Error::new_owned(format!(
-                    "string of length {} exceeds maximum of u16::MAX",
-                    s.len()
+                    "string of length {len} exceeds maximum of u16::MAX"
                 )))
             }
         }
 
-        Ok(self.writer.write_all(&s)?)
+        // Conversion to modified UTF-8 never decreases the size of the string.
+        // If the new len is equal to the original len, we know it doesn't need
+        // to be re-encoded.
+        if len == s.len() {
+            self.writer.write_all(s.as_bytes())?;
+        } else {
+            modified_utf8::write_modified_utf8(&mut self.writer, s)?;
+        }
+
+        Ok(())
     }
 
     fn write_any_list(&mut self, list: &List) -> Result<()> {
@@ -144,11 +188,8 @@ impl<W: Write> EncodeState<W> {
                 self.write_list(bal, Tag::ByteArray, |st, ba| st.write_byte_array(ba))
             }
             List::String(sl) => self.write_list(sl, Tag::String, |st, s| st.write_string(s)),
-            List::List(ll) => {
-                self.check_depth(|st| st.write_list(ll, Tag::List, |st, l| st.write_any_list(l)))
-            }
-            List::Compound(cl) => self
-                .check_depth(|st| st.write_list(cl, Tag::Compound, |st, c| st.write_compound(c))),
+            List::List(ll) => self.write_list(ll, Tag::List, |st, l| st.write_any_list(l)),
+            List::Compound(cl) => self.write_list(cl, Tag::Compound, |st, c| st.write_compound(c)),
             List::IntArray(ial) => {
                 self.write_list(ial, Tag::IntArray, |st, ia| st.write_int_array(ia))
             }