1#![allow(missing_docs)]
2use bytes::{Bytes, BytesMut};
3use encoding_rs::{CoderResult, Encoding};
4
5use crate::internal_events::{
6 DecoderBomRemoval, DecoderMalformedReplacement, EncoderUnmappableReplacement,
7};
8
/// Size, in bytes, of the fixed scratch buffer each transcoder writes one chunk
/// of output into before that chunk is appended to its growable output.
const BUFFER_SIZE: usize = 4096;

/// The byte order mark (U+FEFF) as it appears once encoded in UTF-8.
const BOM_UTF8: &[u8] = "\u{FEFF}".as_bytes();
/// Number of bytes the UTF-8 encoded byte order mark occupies.
const BOM_UTF8_LEN: usize = BOM_UTF8.len();
15
16pub struct Decoder {
18 buffer: [u8; BUFFER_SIZE],
19 output: BytesMut,
20 inner: encoding_rs::Decoder,
21}
22
23impl Decoder {
24 pub fn new(encoding: &'static Encoding) -> Self {
25 Self {
26 buffer: [0; BUFFER_SIZE],
27 output: BytesMut::new(),
28 inner: encoding.new_decoder_without_bom_handling(),
44 }
45 }
46
47 pub fn decode_to_utf8(&mut self, input: Bytes) -> Bytes {
48 let mut total_read_from_input = 0;
49 let mut total_had_errors = false;
50
51 loop {
52 let (result, read, written, had_errors) = self.inner.decode_to_utf8(
53 &input[total_read_from_input..],
54 &mut self.buffer,
55 false, );
57
58 total_read_from_input += read;
59 total_had_errors |= had_errors;
60
61 self.output.extend_from_slice(&self.buffer[..written]);
62
63 match result {
64 CoderResult::InputEmpty => break, CoderResult::OutputFull => (), }
67 }
68
69 if total_had_errors {
70 emit!(DecoderMalformedReplacement {
71 from_encoding: self.inner.encoding().name()
72 });
73 }
74
75 let output = self.output.split().freeze();
76
77 if output.get(..BOM_UTF8_LEN) == Some(BOM_UTF8) {
88 emit!(DecoderBomRemoval {
89 from_encoding: self.inner.encoding().name()
90 });
91 output.slice(BOM_UTF8_LEN..)
92 } else {
93 output
94 }
95 }
96}
97
98pub struct Encoder {
100 buffer: [u8; BUFFER_SIZE],
101 output: BytesMut,
102 inner: encoding_rs::Encoder,
103 utf16_encoding: Option<Utf16Encoding>,
108}
109
/// Byte order used when writing UTF-16 code units.
#[derive(Clone, Copy, Debug)]
enum Utf16Encoding {
    /// Big-endian: most significant byte of each code unit first.
    Be,
    /// Little-endian: least significant byte of each code unit first.
    Le,
}
115
116impl Encoder {
117 pub fn new(encoding: &'static Encoding) -> Self {
118 Self {
119 buffer: [0; BUFFER_SIZE],
120 output: BytesMut::new(),
121 inner: encoding.new_encoder(),
122 utf16_encoding: Self::get_utf16_encoding(encoding),
123 }
124 }
125
126 fn get_utf16_encoding(encoding: &'static Encoding) -> Option<Utf16Encoding> {
127 match encoding.name() {
128 "UTF-16LE" => Some(Utf16Encoding::Le),
129 "UTF-16BE" => Some(Utf16Encoding::Be),
130 _ => None,
131 }
132 }
133
134 fn encode_from_utf8_to_utf16(&mut self, input: &str, variant: Utf16Encoding) -> Bytes {
135 let to_bytes_func = match variant {
136 Utf16Encoding::Le => u16::to_le_bytes,
137 Utf16Encoding::Be => u16::to_be_bytes,
138 };
139
140 for utf16_value in input.encode_utf16() {
141 self.output.extend_from_slice(&to_bytes_func(utf16_value));
142 }
143
144 self.output.split().freeze()
145 }
146
147 pub fn encode_from_utf8(&mut self, input: &str) -> Bytes {
148 if let Some(variant) = self.utf16_encoding {
150 return self.encode_from_utf8_to_utf16(input, variant);
151 }
152
153 let mut total_read_from_input = 0;
154 let mut total_had_errors = false;
155
156 loop {
157 #[expect(
158 clippy::string_slice,
159 reason = "total_read_from_input is a byte offset returned by the encoder, always a char boundary"
160 )]
161 let (result, read, written, had_errors) = self.inner.encode_from_utf8(
162 &input[total_read_from_input..],
163 &mut self.buffer,
164 false, );
166
167 total_read_from_input += read;
168 total_had_errors |= had_errors;
169
170 self.output.extend_from_slice(&self.buffer[..written]);
171
172 match result {
173 CoderResult::InputEmpty => break, CoderResult::OutputFull => (), }
176 }
177
178 if total_had_errors {
179 emit!(EncoderUnmappableReplacement {
180 to_encoding: self.inner.encoding().name()
181 });
182 }
183
184 self.output.split().freeze()
185 }
186}
187
#[cfg(test)]
mod tests {
    use std::char::REPLACEMENT_CHARACTER;

    use bytes::Bytes;
    use encoding_rs::{SHIFT_JIS, UTF_8, UTF_16BE, UTF_16LE};

    use super::{BOM_UTF8, Decoder, Encoder};

    /// The byte order mark (U+FEFF) encoded as UTF-16LE.
    const BOM_UTF16LE: &[u8] = b"\xff\xfe";

    /// "123" encoded as UTF-16LE.
    const fn test_data_utf16le_123() -> &'static [u8] {
        b"1\x002\x003\x00"
    }

    /// "\r\n" encoded as UTF-16LE.
    const fn test_data_utf16le_crlf() -> &'static [u8] {
        b"\r\x00\n\x00"
    }

    /// "भेक्टर" encoded as UTF-16LE.
    const fn test_data_utf16le_vector_devanagari() -> &'static [u8] {
        b"-\tG\t\x15\tM\t\x1f\t0\t"
    }

    /// "123" encoded as UTF-16BE.
    const fn test_data_utf16be_123() -> &'static [u8] {
        b"\x001\x002\x003"
    }

    /// "\r\n" encoded as UTF-16BE.
    const fn test_data_utf16be_crlf() -> &'static [u8] {
        b"\x00\r\x00\n"
    }

    /// "भेक्टर" encoded as UTF-16BE.
    const fn test_data_utf16be_vector_devanagari() -> &'static [u8] {
        b"\t-\tG\t\x15\tM\t\x1f\t0"
    }

    /// U+30CF U+30ED U+30FC U+30FB U+30EF U+30FC U+30EB U+30C9 encoded as
    /// Shift_JIS.
    const fn test_data_shiftjis_helloworld_japanese() -> &'static [u8] {
        b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h"
    }

    #[test]
    fn test_decoder_various() {
        let mut d = Decoder::new(UTF_8);
        assert_eq!(d.decode_to_utf8(Bytes::from("123")), Bytes::from("123"));
        assert_eq!(d.decode_to_utf8(Bytes::from("\n")), Bytes::from("\n"));
        assert_eq!(d.decode_to_utf8(Bytes::from("भेक्टर")), Bytes::from("भेक्टर"));

        let mut d = Decoder::new(UTF_16LE);
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16le_123())),
            Bytes::from("123")
        );
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16le_crlf())),
            Bytes::from("\r\n")
        );
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16le_vector_devanagari())),
            Bytes::from("भेक्टर")
        );

        let mut d = Decoder::new(UTF_16BE);
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16be_123())),
            Bytes::from("123")
        );
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16be_crlf())),
            Bytes::from("\r\n")
        );
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16be_vector_devanagari())),
            Bytes::from("भेक्टर")
        );

        let mut d = Decoder::new(SHIFT_JIS);
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_shiftjis_helloworld_japanese())),
            Bytes::from("\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}")
        );
    }

    #[test]
    fn test_decoder_long_input() {
        let mut d = Decoder::new(UTF_8);

        let long_input = "This line is super long and will take up more space than Decoder's internal buffer, just to make sure that everything works properly when multiple inner decode calls are involved".repeat(10000);

        assert_eq!(
            d.decode_to_utf8(Bytes::from(long_input.clone())),
            Bytes::from(long_input)
        );
    }

    #[test]
    fn test_decoder_replacements() {
        let mut d = Decoder::new(UTF_8);

        // The two UTF-16LE BOM bytes are not valid UTF-8, so each one is
        // expected to come out as a replacement character.
        let problematic_input = [BOM_UTF16LE, b"123"].concat();

        assert_eq!(
            d.decode_to_utf8(Bytes::from(problematic_input)),
            Bytes::from(format!("{REPLACEMENT_CHARACTER}{REPLACEMENT_CHARACTER}123"))
        );
    }

    #[test]
    fn test_decoder_bom_removal() {
        let mut d = Decoder::new(UTF_16LE);

        let input_bom_start = [BOM_UTF16LE, test_data_utf16le_123()].concat();

        // A BOM at the start of the input is removed.
        assert_eq!(
            d.decode_to_utf8(Bytes::from(input_bom_start.clone())),
            Bytes::from("123")
        );

        // The same holds for a later input to the same decoder, not only the
        // first one it sees.
        assert_eq!(
            d.decode_to_utf8(Bytes::from(input_bom_start)),
            Bytes::from("123")
        );

        // A BOM that is not at the start of the input is kept.
        assert_eq!(
            d.decode_to_utf8(Bytes::from(
                [
                    test_data_utf16le_123(),
                    BOM_UTF16LE,
                    test_data_utf16le_123(),
                ]
                .concat()
            )),
            Bytes::from([b"123", BOM_UTF8, b"123"].concat())
        );

        // Inputs without any BOM pass through unchanged.
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16le_123())),
            Bytes::from("123")
        );
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16le_crlf())),
            Bytes::from("\r\n")
        );
    }

    #[test]
    fn test_encoder_various() {
        let mut d = Encoder::new(UTF_8);
        assert_eq!(d.encode_from_utf8("123"), Bytes::from("123"));
        assert_eq!(d.encode_from_utf8("\n"), Bytes::from("\n"));
        assert_eq!(d.encode_from_utf8("भेक्टर"), Bytes::from("भेक्टर"));

        let mut d = Encoder::new(UTF_16LE);
        assert_eq!(
            d.encode_from_utf8("123"),
            Bytes::from(test_data_utf16le_123())
        );
        assert_eq!(
            d.encode_from_utf8("\r\n"),
            Bytes::from(test_data_utf16le_crlf())
        );
        assert_eq!(
            d.encode_from_utf8("भेक्टर"),
            Bytes::from(test_data_utf16le_vector_devanagari())
        );

        let mut d = Encoder::new(UTF_16BE);
        assert_eq!(
            d.encode_from_utf8("123"),
            Bytes::from(test_data_utf16be_123())
        );
        assert_eq!(
            d.encode_from_utf8("\r\n"),
            Bytes::from(test_data_utf16be_crlf())
        );
        assert_eq!(
            d.encode_from_utf8("भेक्टर"),
            Bytes::from(test_data_utf16be_vector_devanagari())
        );

        let mut d = Encoder::new(SHIFT_JIS);
        assert_eq!(
            d.encode_from_utf8("\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}"),
            Bytes::from(test_data_shiftjis_helloworld_japanese())
        );
    }

    #[test]
    fn test_encoder_long_input() {
        let mut d = Encoder::new(UTF_8);

        let long_input = "This line is super long and will take up more space than Encoder's internal buffer, just to make sure that everything works properly when multiple inner encode calls are involved".repeat(10000);

        assert_eq!(
            d.encode_from_utf8(long_input.as_str()),
            Bytes::from(long_input)
        );
    }

    #[test]
    fn test_encoder_replacements() {
        let mut d = Encoder::new(SHIFT_JIS);

        // U+2638 and U+262F cannot be represented in Shift_JIS, so the encoder
        // substitutes a decimal numeric character reference for each of them.
        let problematic_input = "\u{2638}123\u{262F}";

        // The expected text is assembled from the code points so that it is
        // plain ASCII and cannot be mistaken for the characters themselves.
        let expected = format!("&#{};123&#{};", 0x2638, 0x262F);

        assert_eq!(
            d.encode_from_utf8(problematic_input),
            Bytes::from(expected)
        );
    }

    #[test]
    fn test_transcode_symmetry() {
        let encoding = UTF_16LE;
        let mut encoder = Encoder::new(encoding);
        let mut decoder = Decoder::new(encoding);

        let input = "οὐροβόρος";

        // Encoding and then decoding with the same encoding must give back the
        // original text.
        assert_eq!(
            decoder.decode_to_utf8(encoder.encode_from_utf8(input)),
            Bytes::from(input),
        );
    }
}