mirror of
https://github.com/italicsjenga/valence.git
synced 2025-01-27 05:56:33 +11:00
Update valence_nbt to version 0.3.0 (#120)
Improves write performance and adds `binary_encoded_len` to compounds.
This commit is contained in:
parent
9faac7a0fb
commit
d22f1edae1
8 changed files with 250 additions and 70 deletions
|
@ -79,14 +79,14 @@
|
|||
trivial_casts,
|
||||
trivial_numeric_casts,
|
||||
unused_lifetimes,
|
||||
unused_import_braces
|
||||
unused_import_braces,
|
||||
clippy::dbg_macro
|
||||
)]
|
||||
#![allow(
|
||||
clippy::derive_partial_eq_without_eq,
|
||||
clippy::unusual_byte_groupings,
|
||||
clippy::comparison_chain
|
||||
)]
|
||||
#![deny(clippy::dbg_macro)]
|
||||
|
||||
/// Used on [`Config`](config::Config) to allow for async methods in traits.
|
||||
///
|
||||
|
|
|
@ -6,7 +6,7 @@ repository = "https://github.com/valence-rs/valence/tree/main/valence_nbt"
|
|||
readme = "README.md"
|
||||
license = "MIT"
|
||||
keywords = ["nbt", "minecraft", "serialization"]
|
||||
version = "0.2.0"
|
||||
version = "0.3.0"
|
||||
authors = ["Ryan Johnson <ryanj00a@gmail.com>"]
|
||||
edition = "2021"
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ use std::hash::Hash;
|
|||
use std::iter::FusedIterator;
|
||||
use std::ops::{Index, IndexMut};
|
||||
|
||||
use crate::to_binary_writer::encoded_len;
|
||||
use crate::Value;
|
||||
|
||||
/// A map type with [`String`] keys and [`Value`] values.
|
||||
|
@ -17,6 +18,22 @@ type Map = std::collections::BTreeMap<String, Value>;
|
|||
#[cfg(feature = "preserve_order")]
|
||||
type Map = indexmap::IndexMap<String, Value>;
|
||||
|
||||
impl Compound {
|
||||
/// Returns the number of bytes that will be written with
|
||||
/// [`to_binary_writer`] when called with this compound and root name.
|
||||
///
|
||||
/// If [`to_binary_writer`] results in `Ok`, the exact number of bytes
|
||||
/// reported by this function will have been written.
|
||||
///
|
||||
/// If the result is `Err`, then the reported count will be greater than or
|
||||
/// equal to the number of bytes that have actually been written.
|
||||
///
|
||||
/// [`to_binary_writer`]: crate::to_binary_writer()
|
||||
pub fn binary_encoded_len(&self, root_name: &str) -> usize {
|
||||
encoded_len(self, root_name)
|
||||
}
|
||||
}
|
||||
|
||||
impl Compound {
|
||||
pub fn new() -> Self {
|
||||
Self { map: Map::new() }
|
||||
|
|
|
@ -4,7 +4,7 @@ use byteorder::{BigEndian, ReadBytesExt};
|
|||
use cesu8::Cesu8DecodingError;
|
||||
|
||||
use crate::tag::Tag;
|
||||
use crate::{Compound, Error, List, Result, Value, MAX_DEPTH};
|
||||
use crate::{Compound, Error, List, Result, Value};
|
||||
|
||||
/// Decodes uncompressed NBT binary data from the provided slice.
|
||||
///
|
||||
|
@ -27,6 +27,9 @@ pub fn from_binary_slice(slice: &mut &[u8]) -> Result<(Compound, String)> {
|
|||
Ok((root, root_name))
|
||||
}
|
||||
|
||||
/// Maximum recursion depth to prevent overflowing the call stack.
|
||||
const MAX_DEPTH: usize = 512;
|
||||
|
||||
struct DecodeState<'a, 'b> {
|
||||
slice: &'a mut &'b [u8],
|
||||
/// Current recursion depth.
|
||||
|
|
|
@ -49,7 +49,22 @@
|
|||
//! preserved during insertion and deletion at a slight cost to performance.
|
||||
//! The iterators on `Compound` can then implement [`DoubleEndedIterator`].
|
||||
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(
|
||||
rustdoc::broken_intra_doc_links,
|
||||
rustdoc::private_intra_doc_links,
|
||||
rustdoc::missing_crate_level_docs,
|
||||
rustdoc::invalid_codeblock_attributes,
|
||||
rustdoc::invalid_rust_codeblocks,
|
||||
rustdoc::bare_urls
|
||||
)]
|
||||
#![warn(
|
||||
trivial_casts,
|
||||
trivial_numeric_casts,
|
||||
unused_lifetimes,
|
||||
unused_import_braces,
|
||||
clippy::dbg_macro
|
||||
)]
|
||||
#![allow(clippy::unusual_byte_groupings)]
|
||||
|
||||
pub use compound::Compound;
|
||||
pub use error::Error;
|
||||
|
@ -60,6 +75,7 @@ pub use value::{List, Value};
|
|||
pub mod compound;
|
||||
mod error;
|
||||
mod from_binary_slice;
|
||||
mod modified_utf8;
|
||||
mod to_binary_writer;
|
||||
pub mod value;
|
||||
|
||||
|
@ -67,9 +83,6 @@ mod tag;
|
|||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
/// Maximum recursion depth to prevent overflowing the call stack.
|
||||
const MAX_DEPTH: usize = 512;
|
||||
|
||||
type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// A convenience macro for constructing [`Compound`]s.
|
||||
|
|
128
valence_nbt/src/modified_utf8.rs
Normal file
128
valence_nbt/src/modified_utf8.rs
Normal file
|
@ -0,0 +1,128 @@
|
|||
//! Utilities for working with Java's "Modified UTF-8" character encoding.
|
||||
//!
|
||||
//! For more information, refer to [Wikipedia].
|
||||
//!
|
||||
//! [Wikipedia]: https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8
|
||||
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::str::from_utf8_unchecked;
|
||||
|
||||
use byteorder::{BigEndian, WriteBytesExt};
|
||||
|
||||
pub fn write_modified_utf8(mut writer: impl Write, text: &str) -> io::Result<()> {
|
||||
let bytes = text.as_bytes();
|
||||
let mut i = 0;
|
||||
|
||||
while i < bytes.len() {
|
||||
match bytes[i] {
|
||||
0 => {
|
||||
writer.write_u16::<BigEndian>(0xc080)?;
|
||||
i += 1;
|
||||
}
|
||||
b @ 1..=127 => {
|
||||
writer.write_u8(b)?;
|
||||
i += 1;
|
||||
}
|
||||
b => {
|
||||
let w = utf8_char_width(b);
|
||||
debug_assert!(w <= 4);
|
||||
debug_assert!(i + w <= bytes.len());
|
||||
|
||||
if w != 4 {
|
||||
writer.write_all(&bytes[i..i + w])?;
|
||||
} else {
|
||||
let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) };
|
||||
let c = s.chars().next().unwrap() as u32 - 0x10000;
|
||||
|
||||
let s0 = ((c >> 10) as u16) | 0xd800;
|
||||
let s1 = ((c & 0x3ff) as u16) | 0xdc00;
|
||||
|
||||
writer.write_all(encode_surrogate(s0).as_slice())?;
|
||||
writer.write_all(encode_surrogate(s1).as_slice())?;
|
||||
}
|
||||
i += w;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the length in bytes of the UTF-8 sequence introduced by
/// `first_byte`, or `0` if no sequence can start with that byte.
const fn utf8_char_width(first_byte: u8) -> usize {
    // Lookup table indexed by lead byte, generated at compile time from the
    // lead-byte ranges rather than spelled out entry by entry.
    const WIDTHS: [u8; 256] = {
        let mut table = [0u8; 256];
        let mut byte = 0usize;

        while byte < 256 {
            table[byte] = match byte {
                0x00..=0x7f => 1,
                0xc2..=0xdf => 2,
                0xe0..=0xef => 3,
                0xf0..=0xf4 => 4,
                _ => 0,
            };
            byte += 1;
        }

        table
    };

    WIDTHS[first_byte as usize] as usize
}
|
||||
|
||||
/// Encodes a single UTF-16 surrogate code unit as a three-byte sequence.
///
/// The 16 bits are split 4/6/6 across a lead byte tagged `0xe0` and two
/// continuation bytes tagged `0x80`.
fn encode_surrogate(surrogate: u16) -> [u8; 3] {
    debug_assert!((0xd800..=0xdfff).contains(&surrogate));

    let top = (surrogate >> 12) as u8;
    let middle = ((surrogate >> 6) & 0x3f) as u8;
    let bottom = (surrogate & 0x3f) as u8;

    [0xe0 | top, 0x80 | middle, 0x80 | bottom]
}
|
||||
|
||||
/// Returns the number of bytes `text` occupies once encoded as Modified
/// UTF-8.
///
/// Every byte of the input counts once, with two adjustments: a NUL byte
/// counts as two bytes, and the lead byte of a four-byte sequence counts as
/// three so that the whole character totals six.
pub fn encoded_len(text: &str) -> usize {
    text.bytes()
        .map(|byte| match byte {
            0 => 2,
            // In a valid `&str`, bytes this large only occur as the lead
            // byte of a four-byte sequence.
            0xf0..=0xff => 3,
            _ => 1,
        })
        .sum()
}
|
||||
|
||||
/// Checks that our encoder and length calculation agree with the `cesu8`
/// crate on ASCII with NUL, multi-byte and supplementary characters.
#[cfg(test)]
#[test]
fn equivalence() {
    let samples = [
        "Mary had a little lamb\0",
        "🤡💩👻💀☠👽👾🤖🎃😺😸😹😻😼😽🙀😿😾",
        "ÅÆÇÈØõ÷£¥ý",
    ];

    for sample in samples {
        let expected = cesu8::to_java_cesu8(sample);

        let mut actual = Vec::new();
        write_modified_utf8(&mut actual, sample).unwrap();

        assert_eq!(expected, actual);
        assert_eq!(expected.len(), encoded_len(sample));
    }
}
|
|
@ -1,5 +1,3 @@
|
|||
use std::mem;
|
||||
|
||||
use crate::tag::Tag;
|
||||
use crate::{compound, from_binary_slice, to_binary_writer, Compound, List, Value};
|
||||
|
||||
|
@ -53,20 +51,6 @@ fn check_min_sizes() {
|
|||
check(Value::LongArray([].into()), 4);
|
||||
}
|
||||
|
||||
#[test]
fn deeply_nested_compound_encode() {
    // Wrap a single byte value in 10,000 layers of compounds.
    let nested = (0..10_000).fold(compound!("" => 111_i8), |inner, _| {
        compound!("" => inner)
    });

    // Should not overflow the stack
    let _ = to_binary_writer(&mut Vec::new(), &nested, ROOT_NAME);

    // Don't overflow the stack while dropping.
    mem::forget(nested);
}
|
||||
|
||||
#[test]
|
||||
fn deeply_nested_compound_decode() {
|
||||
let mut buf = vec![Tag::Compound as u8, 0, 0]; // Root compound
|
||||
|
@ -84,22 +68,6 @@ fn deeply_nested_compound_decode() {
|
|||
let _ = from_binary_slice(&mut buf.as_slice());
|
||||
}
|
||||
|
||||
#[test]
fn deeply_nested_list_encode() {
    // Wrap an empty byte list in 10,000 layers of lists.
    let nested = (0..10_000).fold(List::Byte(Vec::new()), |inner, _| {
        List::List(vec![inner])
    });

    let root = compound!("" => nested);

    // Should not panic
    let _ = to_binary_writer(&mut Vec::new(), &root, ROOT_NAME);

    // Don't overflow the stack while dropping.
    mem::forget(root);
}
|
||||
|
||||
#[test]
|
||||
fn deeply_nested_list_decode() {
|
||||
// Root compound with one field.
|
||||
|
@ -119,6 +87,16 @@ fn deeply_nested_list_decode() {
|
|||
let _ = from_binary_slice(&mut buf.as_slice());
|
||||
}
|
||||
|
||||
/// The length predicted by `binary_encoded_len` must equal the number of
/// bytes `to_binary_writer` actually produces for the same root name.
#[test]
fn correct_length() {
    let root_name = "abc";
    let compound = example_compound();

    let mut encoded = Vec::new();
    to_binary_writer(&mut encoded, &compound, root_name).unwrap();

    assert_eq!(compound.binary_encoded_len(root_name), encoded.len());
}
|
||||
|
||||
#[cfg(feature = "preserve_order")]
|
||||
#[test]
|
||||
fn preserves_order() {
|
||||
|
|
|
@ -4,7 +4,7 @@ use byteorder::{BigEndian, WriteBytesExt};
|
|||
use zerocopy::AsBytes;
|
||||
|
||||
use crate::tag::Tag;
|
||||
use crate::{Compound, Error, List, Result, Value, MAX_DEPTH};
|
||||
use crate::{modified_utf8, Compound, Error, List, Result, Value};
|
||||
|
||||
/// Encodes uncompressed NBT binary data to the provided writer.
|
||||
///
|
||||
|
@ -14,35 +14,71 @@ use crate::{Compound, Error, List, Result, Value, MAX_DEPTH};
|
|||
/// Additionally, the root compound can be given a name. Typically the empty
|
||||
/// string `""` is used.
|
||||
pub fn to_binary_writer<W: Write>(writer: W, compound: &Compound, root_name: &str) -> Result<()> {
|
||||
let mut state = EncodeState { writer, depth: 0 };
|
||||
let mut state = EncodeState { writer };
|
||||
|
||||
state.write_tag(Tag::Compound)?;
|
||||
state.write_string(root_name)?;
|
||||
state.write_compound(compound)?;
|
||||
|
||||
debug_assert_eq!(state.depth, 0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn encoded_len(compound: &Compound, root_name: &str) -> usize {
|
||||
fn value_len(val: &Value) -> usize {
|
||||
match val {
|
||||
Value::Byte(_) => 1,
|
||||
Value::Short(_) => 2,
|
||||
Value::Int(_) => 4,
|
||||
Value::Long(_) => 8,
|
||||
Value::Float(_) => 4,
|
||||
Value::Double(_) => 8,
|
||||
Value::ByteArray(ba) => 4 + ba.len(),
|
||||
Value::String(s) => string_len(s),
|
||||
Value::List(l) => list_len(l),
|
||||
Value::Compound(c) => compound_len(c),
|
||||
Value::IntArray(ia) => 4 + ia.len() * 4,
|
||||
Value::LongArray(la) => 4 + la.len() * 8,
|
||||
}
|
||||
}
|
||||
|
||||
fn list_len(l: &List) -> usize {
|
||||
let elems_len = match l {
|
||||
List::Byte(b) => b.len(),
|
||||
List::Short(s) => s.len() * 2,
|
||||
List::Int(i) => i.len() * 4,
|
||||
List::Long(l) => l.len() * 8,
|
||||
List::Float(f) => f.len() * 4,
|
||||
List::Double(d) => d.len() * 8,
|
||||
List::ByteArray(ba) => ba.iter().map(|b| 4 + b.len()).sum(),
|
||||
List::String(s) => s.iter().map(|s| string_len(s)).sum(),
|
||||
List::List(l) => l.iter().map(list_len).sum(),
|
||||
List::Compound(c) => c.iter().map(compound_len).sum(),
|
||||
List::IntArray(i) => i.iter().map(|i| 4 + i.len() * 4).sum(),
|
||||
List::LongArray(l) => l.iter().map(|l| 4 + l.len() * 8).sum(),
|
||||
};
|
||||
|
||||
1 + 4 + elems_len
|
||||
}
|
||||
|
||||
fn string_len(s: &str) -> usize {
|
||||
2 + modified_utf8::encoded_len(s)
|
||||
}
|
||||
|
||||
fn compound_len(c: &Compound) -> usize {
|
||||
c.iter()
|
||||
.map(|(k, v)| 1 + string_len(k) + value_len(v))
|
||||
.sum::<usize>()
|
||||
+ 1
|
||||
}
|
||||
|
||||
1 + string_len(root_name) + compound_len(compound)
|
||||
}
|
||||
|
||||
struct EncodeState<W> {
|
||||
writer: W,
|
||||
/// Current recursion depth.
|
||||
depth: usize,
|
||||
}
|
||||
|
||||
impl<W: Write> EncodeState<W> {
|
||||
#[inline]
|
||||
fn check_depth<T>(&mut self, f: impl FnOnce(&mut Self) -> Result<T>) -> Result<T> {
|
||||
if self.depth >= MAX_DEPTH {
|
||||
return Err(Error::new_static("reached maximum recursion depth"));
|
||||
}
|
||||
|
||||
self.depth += 1;
|
||||
let res = f(self);
|
||||
self.depth -= 1;
|
||||
res
|
||||
}
|
||||
|
||||
/// Writes `tag` to the underlying writer as a single byte.
fn write_tag(&mut self, tag: Tag) -> Result<()> {
    self.writer.write_u8(tag as u8)?;
    Ok(())
}
|
||||
|
@ -57,8 +93,8 @@ impl<W: Write> EncodeState<W> {
|
|||
Value::Double(d) => self.write_double(*d),
|
||||
Value::ByteArray(ba) => self.write_byte_array(ba),
|
||||
Value::String(s) => self.write_string(s),
|
||||
Value::List(l) => self.check_depth(|st| st.write_any_list(l)),
|
||||
Value::Compound(c) => self.check_depth(|st| st.write_compound(c)),
|
||||
Value::List(l) => self.write_any_list(l),
|
||||
Value::Compound(c) => self.write_compound(c),
|
||||
Value::IntArray(ia) => self.write_int_array(ia),
|
||||
Value::LongArray(la) => self.write_long_array(la),
|
||||
}
|
||||
|
@ -103,19 +139,27 @@ impl<W: Write> EncodeState<W> {
|
|||
}
|
||||
|
||||
fn write_string(&mut self, s: &str) -> Result<()> {
|
||||
let s = cesu8::to_java_cesu8(s);
|
||||
let len = modified_utf8::encoded_len(s);
|
||||
|
||||
match s.len().try_into() {
|
||||
Ok(len) => self.writer.write_u16::<BigEndian>(len)?,
|
||||
match len.try_into() {
|
||||
Ok(n) => self.writer.write_u16::<BigEndian>(n)?,
|
||||
Err(_) => {
|
||||
return Err(Error::new_owned(format!(
|
||||
"string of length {} exceeds maximum of u16::MAX",
|
||||
s.len()
|
||||
"string of length {len} exceeds maximum of u16::MAX"
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
Ok(self.writer.write_all(&s)?)
|
||||
// Conversion to modified UTF-8 never shrinks a string, and every character
|
||||
// it re-encodes makes the string longer. If the new len is equal to the
|
||||
// original len, we know it doesn't need to be re-encoded.
|
||||
if len == s.len() {
|
||||
self.writer.write_all(s.as_bytes())?;
|
||||
} else {
|
||||
modified_utf8::write_modified_utf8(&mut self.writer, s)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_any_list(&mut self, list: &List) -> Result<()> {
|
||||
|
@ -144,11 +188,8 @@ impl<W: Write> EncodeState<W> {
|
|||
self.write_list(bal, Tag::ByteArray, |st, ba| st.write_byte_array(ba))
|
||||
}
|
||||
List::String(sl) => self.write_list(sl, Tag::String, |st, s| st.write_string(s)),
|
||||
List::List(ll) => {
|
||||
self.check_depth(|st| st.write_list(ll, Tag::List, |st, l| st.write_any_list(l)))
|
||||
}
|
||||
List::Compound(cl) => self
|
||||
.check_depth(|st| st.write_list(cl, Tag::Compound, |st, c| st.write_compound(c))),
|
||||
List::List(ll) => self.write_list(ll, Tag::List, |st, l| st.write_any_list(l)),
|
||||
List::Compound(cl) => self.write_list(cl, Tag::Compound, |st, c| st.write_compound(c)),
|
||||
List::IntArray(ial) => {
|
||||
self.write_list(ial, Tag::IntArray, |st, ia| st.write_int_array(ia))
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue