Update valence_nbt to version 0.3.0 (#120)

Improves write performance and adds `binary_encoded_len` to compounds.
Ryan Johnson 2022-10-17 21:11:20 -07:00 committed by GitHub
parent 9faac7a0fb
commit d22f1edae1
8 changed files with 250 additions and 70 deletions
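The main addition is `Compound::binary_encoded_len`, which reports the serialized size before any bytes are written. A rough usage sketch, not part of the diff (the field name, root name, and the `i32` literal are arbitrary; `compound!` and `to_binary_writer` are the crate items shown in the hunks below):

use valence_nbt::{compound, to_binary_writer};

fn main() {
    let c = compound!("count" => 3_i32);

    // Ask for the exact serialized size up front and pre-allocate,
    // so the writer never has to grow the buffer.
    let len = c.binary_encoded_len("root");
    let mut buf = Vec::with_capacity(len);

    to_binary_writer(&mut buf, &c, "root").unwrap();

    // On success, the reported length is exact.
    assert_eq!(buf.len(), len);
}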


@@ -79,14 +79,14 @@
trivial_casts,
trivial_numeric_casts,
unused_lifetimes,
unused_import_braces
unused_import_braces,
clippy::dbg_macro
)]
#![allow(
clippy::derive_partial_eq_without_eq,
clippy::unusual_byte_groupings,
clippy::comparison_chain
)]
#![deny(clippy::dbg_macro)]
/// Used on [`Config`](config::Config) to allow for async methods in traits.
///


@@ -6,7 +6,7 @@ repository = "https://github.com/valence-rs/valence/tree/main/valence_nbt"
readme = "README.md"
license = "MIT"
keywords = ["nbt", "minecraft", "serialization"]
version = "0.2.0"
version = "0.3.0"
authors = ["Ryan Johnson <ryanj00a@gmail.com>"]
edition = "2021"


@@ -3,6 +3,7 @@ use std::hash::Hash;
use std::iter::FusedIterator;
use std::ops::{Index, IndexMut};
use crate::to_binary_writer::encoded_len;
use crate::Value;
/// A map type with [`String`] keys and [`Value`] values.
@@ -17,6 +18,22 @@ type Map = std::collections::BTreeMap<String, Value>;
#[cfg(feature = "preserve_order")]
type Map = indexmap::IndexMap<String, Value>;
impl Compound {
/// Returns the number of bytes that will be written with
/// [`to_binary_writer`] when called with this compound and root name.
///
/// If [`to_binary_writer`] results in `Ok`, the exact number of bytes
/// reported by this function will have been written.
///
/// If the result is `Err`, then the reported count will be greater than or
/// equal to the number of bytes that have actually been written.
///
/// [`to_binary_writer`]: crate::to_binary_writer()
pub fn binary_encoded_len(&self, root_name: &str) -> usize {
encoded_len(self, root_name)
}
}
impl Compound {
pub fn new() -> Self {
Self { map: Map::new() }


@@ -4,7 +4,7 @@ use byteorder::{BigEndian, ReadBytesExt};
use cesu8::Cesu8DecodingError;
use crate::tag::Tag;
use crate::{Compound, Error, List, Result, Value, MAX_DEPTH};
use crate::{Compound, Error, List, Result, Value};
/// Decodes uncompressed NBT binary data from the provided slice.
///
@@ -27,6 +27,9 @@ pub fn from_binary_slice(slice: &mut &[u8]) -> Result<(Compound, String)> {
Ok((root, root_name))
}
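For orientation, a round-trip through the two public entry points touched by this commit, sketched under the signatures shown in this diff (the compound contents are arbitrary, and the final comparison assumes `Compound` implements `PartialEq`):

use valence_nbt::{compound, from_binary_slice, to_binary_writer};

fn main() {
    let original = compound!("answer" => 42_i32);

    // Encode into an in-memory buffer; the root name is typically "".
    let mut buf = Vec::new();
    to_binary_writer(&mut buf, &original, "").unwrap();

    // Decode back; the slice cursor is advanced past the NBT payload.
    let (decoded, root_name) = from_binary_slice(&mut buf.as_slice()).unwrap();

    assert_eq!(root_name, "");
    assert_eq!(decoded, original); // assumes Compound: PartialEq
}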
/// Maximum recursion depth to prevent overflowing the call stack.
const MAX_DEPTH: usize = 512;
struct DecodeState<'a, 'b> {
slice: &'a mut &'b [u8],
/// Current recursion depth.


@@ -49,7 +49,22 @@
//! preserved during insertion and deletion at a slight cost to performance.
//! The iterators on `Compound` can then implement [`DoubleEndedIterator`].
#![deny(unsafe_code)]
#![deny(
rustdoc::broken_intra_doc_links,
rustdoc::private_intra_doc_links,
rustdoc::missing_crate_level_docs,
rustdoc::invalid_codeblock_attributes,
rustdoc::invalid_rust_codeblocks,
rustdoc::bare_urls
)]
#![warn(
trivial_casts,
trivial_numeric_casts,
unused_lifetimes,
unused_import_braces,
clippy::dbg_macro
)]
#![allow(clippy::unusual_byte_groupings)]
pub use compound::Compound;
pub use error::Error;
@@ -60,6 +75,7 @@ pub use value::{List, Value};
pub mod compound;
mod error;
mod from_binary_slice;
mod modified_utf8;
mod to_binary_writer;
pub mod value;
@@ -67,9 +83,6 @@ mod tag;
#[cfg(test)]
mod tests;
/// Maximum recursion depth to prevent overflowing the call stack.
const MAX_DEPTH: usize = 512;
type Result<T> = std::result::Result<T, Error>;
/// A convenience macro for constructing [`Compound`]s.


@@ -0,0 +1,128 @@
//! Utilities for working with Java's "Modified UTF-8" character encoding.
//!
//! For more information, refer to [Wikipedia].
//!
//! [Wikipedia]: https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8
use std::io;
use std::io::Write;
use std::str::from_utf8_unchecked;
use byteorder::{BigEndian, WriteBytesExt};
pub fn write_modified_utf8(mut writer: impl Write, text: &str) -> io::Result<()> {
let bytes = text.as_bytes();
let mut i = 0;
while i < bytes.len() {
match bytes[i] {
0 => {
writer.write_u16::<BigEndian>(0xc080)?;
i += 1;
}
b @ 1..=127 => {
writer.write_u8(b)?;
i += 1;
}
b => {
let w = utf8_char_width(b);
debug_assert!(w <= 4);
debug_assert!(i + w <= bytes.len());
if w != 4 {
writer.write_all(&bytes[i..i + w])?;
} else {
let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) };
let c = s.chars().next().unwrap() as u32 - 0x10000;
let s0 = ((c >> 10) as u16) | 0xd800;
let s1 = ((c & 0x3ff) as u16) | 0xdc00;
writer.write_all(encode_surrogate(s0).as_slice())?;
writer.write_all(encode_surrogate(s1).as_slice())?;
}
i += w;
}
}
}
Ok(())
}
const fn utf8_char_width(first_byte: u8) -> usize {
const UTF8_CHAR_WIDTH: [u8; 256] = [
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
UTF8_CHAR_WIDTH[first_byte as usize] as _
}
fn encode_surrogate(surrogate: u16) -> [u8; 3] {
debug_assert!((0xd800..=0xdfff).contains(&surrogate));
const TAG_CONT_U8: u8 = 0b1000_0000u8;
[
0b11100000 | ((surrogate & 0b11110000_00000000) >> 12) as u8,
TAG_CONT_U8 | ((surrogate & 0b00001111_11000000) >> 6) as u8,
TAG_CONT_U8 | (surrogate & 0b00000000_00111111) as u8,
]
}
pub fn encoded_len(text: &str) -> usize {
let mut n = 0;
let mut i = 0;
let bytes = text.as_bytes();
while i < bytes.len() {
match bytes[i] {
0 => {
n += 2;
i += 1;
}
// Fast path for ASCII here makes a huge difference in benchmarks.
1..=127 => {
n += 1;
i += 1;
}
b => {
let w = utf8_char_width(b);
if w == 4 {
n += 6;
} else {
n += w;
}
i += w;
}
}
}
n
}
#[cfg(test)]
#[test]
fn equivalence() {
fn check(s: &str) {
let mut ours = Vec::new();
let theirs = cesu8::to_java_cesu8(s);
write_modified_utf8(&mut ours, s).unwrap();
assert_eq!(theirs, ours);
assert_eq!(theirs.len(), encoded_len(s));
}
check("Mary had a little lamb\0");
check("🤡💩👻💀☠👽👾🤖🎃😺😸😹😻😼😽🙀😿😾");
check("ÅÆÇÈØõ÷£¥ý");
}
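To make the encoding rules above concrete, a sketch written as another crate-internal test alongside `equivalence` (the functions it calls are the ones defined in this file):

#[cfg(test)]
#[test]
fn size_cases() {
    // Plain ASCII: modified UTF-8 is byte-for-byte identical to UTF-8,
    // which is what lets the writer copy such strings verbatim.
    assert_eq!(encoded_len("hello"), "hello".len());

    // NUL is re-encoded as the two-byte sequence 0xC0 0x80.
    let mut out = Vec::new();
    write_modified_utf8(&mut out, "\0").unwrap();
    assert_eq!(out, [0xc0, 0x80]);
    assert_eq!(encoded_len("\0"), 2);

    // A supplementary-plane character (4 bytes of UTF-8) is split into a
    // surrogate pair, 3 bytes each, so it grows to 6 bytes.
    assert_eq!("🤡".len(), 4);
    assert_eq!(encoded_len("🤡"), 6);
}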


@@ -1,5 +1,3 @@
use std::mem;
use crate::tag::Tag;
use crate::{compound, from_binary_slice, to_binary_writer, Compound, List, Value};
@@ -53,20 +51,6 @@ fn check_min_sizes() {
check(Value::LongArray([].into()), 4);
}
#[test]
fn deeply_nested_compound_encode() {
let mut c = compound!("" => 111_i8);
for _ in 0..10_000 {
c = compound!("" => c);
}
// Should not overflow the stack
let _ = to_binary_writer(&mut Vec::new(), &c, ROOT_NAME);
// Don"t overflow the stack while dropping.
mem::forget(c);
}
#[test]
fn deeply_nested_compound_decode() {
let mut buf = vec![Tag::Compound as u8, 0, 0]; // Root compound
@@ -84,22 +68,6 @@ fn deeply_nested_compound_decode() {
let _ = from_binary_slice(&mut buf.as_slice());
}
#[test]
fn deeply_nested_list_encode() {
let mut l = List::Byte(Vec::new());
for _ in 0..10_000 {
l = List::List(vec![l]);
}
let c = compound!("" => l);
// Should not panic
let _ = to_binary_writer(&mut Vec::new(), &c, ROOT_NAME);
// Don"t overflow the stack while dropping.
mem::forget(c);
}
#[test]
fn deeply_nested_list_decode() {
// Root compound with one field.
@@ -119,6 +87,16 @@ fn deeply_nested_list_decode() {
let _ = from_binary_slice(&mut buf.as_slice());
}
#[test]
fn correct_length() {
let c = example_compound();
let mut buf = Vec::new();
to_binary_writer(&mut buf, &c, "abc").unwrap();
assert_eq!(c.binary_encoded_len("abc"), buf.len());
}
#[cfg(feature = "preserve_order")]
#[test]
fn preserves_order() {


@@ -4,7 +4,7 @@ use byteorder::{BigEndian, WriteBytesExt};
use zerocopy::AsBytes;
use crate::tag::Tag;
use crate::{Compound, Error, List, Result, Value, MAX_DEPTH};
use crate::{modified_utf8, Compound, Error, List, Result, Value};
/// Encodes uncompressed NBT binary data to the provided writer.
///
@@ -14,35 +14,71 @@ use crate::{Compound, Error, List, Result, Value, MAX_DEPTH};
/// Additionally, the root compound can be given a name. Typically the empty
/// string `""` is used.
pub fn to_binary_writer<W: Write>(writer: W, compound: &Compound, root_name: &str) -> Result<()> {
let mut state = EncodeState { writer, depth: 0 };
let mut state = EncodeState { writer };
state.write_tag(Tag::Compound)?;
state.write_string(root_name)?;
state.write_compound(compound)?;
debug_assert_eq!(state.depth, 0);
Ok(())
}
pub(crate) fn encoded_len(compound: &Compound, root_name: &str) -> usize {
fn value_len(val: &Value) -> usize {
match val {
Value::Byte(_) => 1,
Value::Short(_) => 2,
Value::Int(_) => 4,
Value::Long(_) => 8,
Value::Float(_) => 4,
Value::Double(_) => 8,
Value::ByteArray(ba) => 4 + ba.len(),
Value::String(s) => string_len(s),
Value::List(l) => list_len(l),
Value::Compound(c) => compound_len(c),
Value::IntArray(ia) => 4 + ia.len() * 4,
Value::LongArray(la) => 4 + la.len() * 8,
}
}
fn list_len(l: &List) -> usize {
let elems_len = match l {
List::Byte(b) => b.len(),
List::Short(s) => s.len() * 2,
List::Int(i) => i.len() * 4,
List::Long(l) => l.len() * 8,
List::Float(f) => f.len() * 4,
List::Double(d) => d.len() * 8,
List::ByteArray(ba) => ba.iter().map(|b| 4 + b.len()).sum(),
List::String(s) => s.iter().map(|s| string_len(s)).sum(),
List::List(l) => l.iter().map(list_len).sum(),
List::Compound(c) => c.iter().map(compound_len).sum(),
List::IntArray(i) => i.iter().map(|i| 4 + i.len() * 4).sum(),
List::LongArray(l) => l.iter().map(|l| 4 + l.len() * 8).sum(),
};
1 + 4 + elems_len
}
fn string_len(s: &str) -> usize {
2 + modified_utf8::encoded_len(s)
}
fn compound_len(c: &Compound) -> usize {
c.iter()
.map(|(k, v)| 1 + string_len(k) + value_len(v))
.sum::<usize>()
+ 1
}
1 + string_len(root_name) + compound_len(compound)
}
struct EncodeState<W> {
writer: W,
/// Current recursion depth.
depth: usize,
}
impl<W: Write> EncodeState<W> {
#[inline]
fn check_depth<T>(&mut self, f: impl FnOnce(&mut Self) -> Result<T>) -> Result<T> {
if self.depth >= MAX_DEPTH {
return Err(Error::new_static("reached maximum recursion depth"));
}
self.depth += 1;
let res = f(self);
self.depth -= 1;
res
}
fn write_tag(&mut self, tag: Tag) -> Result<()> {
Ok(self.writer.write_u8(tag as u8)?)
}
@@ -57,8 +93,8 @@ impl<W: Write> EncodeState<W> {
Value::Double(d) => self.write_double(*d),
Value::ByteArray(ba) => self.write_byte_array(ba),
Value::String(s) => self.write_string(s),
Value::List(l) => self.check_depth(|st| st.write_any_list(l)),
Value::Compound(c) => self.check_depth(|st| st.write_compound(c)),
Value::List(l) => self.write_any_list(l),
Value::Compound(c) => self.write_compound(c),
Value::IntArray(ia) => self.write_int_array(ia),
Value::LongArray(la) => self.write_long_array(la),
}
@@ -103,19 +139,27 @@ impl<W: Write> EncodeState<W> {
}
fn write_string(&mut self, s: &str) -> Result<()> {
let s = cesu8::to_java_cesu8(s);
let len = modified_utf8::encoded_len(s);
match s.len().try_into() {
Ok(len) => self.writer.write_u16::<BigEndian>(len)?,
match len.try_into() {
Ok(n) => self.writer.write_u16::<BigEndian>(n)?,
Err(_) => {
return Err(Error::new_owned(format!(
"string of length {} exceeds maximum of u16::MAX",
s.len()
"string of length {len} exceeds maximum of u16::MAX"
)))
}
}
Ok(self.writer.write_all(&s)?)
// Conversion to modified UTF-8 never shrinks the string. If the encoded
// len equals the original len, no character needed re-encoding and the
// bytes can be written through verbatim.
if len == s.len() {
self.writer.write_all(s.as_bytes())?;
} else {
modified_utf8::write_modified_utf8(&mut self.writer, s)?;
}
Ok(())
}
fn write_any_list(&mut self, list: &List) -> Result<()> {
@@ -144,11 +188,8 @@ impl<W: Write> EncodeState<W> {
self.write_list(bal, Tag::ByteArray, |st, ba| st.write_byte_array(ba))
}
List::String(sl) => self.write_list(sl, Tag::String, |st, s| st.write_string(s)),
List::List(ll) => {
self.check_depth(|st| st.write_list(ll, Tag::List, |st, l| st.write_any_list(l)))
}
List::Compound(cl) => self
.check_depth(|st| st.write_list(cl, Tag::Compound, |st, c| st.write_compound(c))),
List::List(ll) => self.write_list(ll, Tag::List, |st, l| st.write_any_list(l)),
List::Compound(cl) => self.write_list(cl, Tag::Compound, |st, c| st.write_compound(c)),
List::IntArray(ial) => {
self.write_list(ial, Tag::IntArray, |st, ia| st.write_int_array(ia))
}
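As a sanity check on the arithmetic in `encoded_len` above, a hand-computed case, not part of the diff (the key, root name, and `i32` value are arbitrary; the byte counts follow `string_len`, `value_len`, and `compound_len` as defined in this file):

use valence_nbt::{compound, to_binary_writer};

fn main() {
    let c = compound!("x" => 1_i32);

    // 1 (Tag::Compound) + 2 + 3   root name "abc"
    // + 1 (Tag::Int)    + 2 + 1   key "x"
    // + 4                         i32 payload
    // + 1 (Tag::End)
    // = 15 bytes
    assert_eq!(c.binary_encoded_len("abc"), 15);

    let mut buf = Vec::new();
    to_binary_writer(&mut buf, &c, "abc").unwrap();
    assert_eq!(buf.len(), 15);
}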