// vector/encoding_transcode.rs

1#![allow(missing_docs)]
2use bytes::{Bytes, BytesMut};
3use encoding_rs::{CoderResult, Encoding};
4
5use crate::internal_events::{
6    DecoderBomRemoval, DecoderMalformedReplacement, EncoderUnmappableReplacement,
7};
8
9const BUFFER_SIZE: usize = 4096;
10
11// BOM unicode character (U+FEFF) expressed in utf-8
12// http://unicode.org/faq/utf_bom.html#bom4
13const BOM_UTF8: &[u8] = b"\xef\xbb\xbf";
14const BOM_UTF8_LEN: usize = BOM_UTF8.len();
15
16/// Helps transcoding from the specified encoding to utf8
17pub struct Decoder {
18    buffer: [u8; BUFFER_SIZE],
19    output: BytesMut,
20    inner: encoding_rs::Decoder,
21}
22
23impl Decoder {
24    pub fn new(encoding: &'static Encoding) -> Self {
25        Self {
26            buffer: [0; BUFFER_SIZE],
27            output: BytesMut::new(),
28            // We explicitly choose not to remove BOM as part of encoding_rs's
29            // decoding capabilities: the library has support for it, but it does
30            // so only for the first input provided to the decoder (basically,
31            // start of the stream), and for our usecases, we may get BOM markers
32            // in later inputs too (eg: when reading multiple files):
33            // https://docs.rs/encoding_rs/0.8.26/encoding_rs/struct.Encoding.html#method.new_decoder_with_bom_removal
34            //
35            // We can try to maintain separate decoders for each unique stream
36            // (eg: by filepath when reading multiple files), but that mandates
37            // cleanup of the initialized decoder structs/buffers when they are
38            // no longer needed (eg: when files are closed), which can get
39            // complicated. So we opt for simplicity here.
40            //
41            // BOM markers are still removed if the input starts with it:
42            // see decode_to_utf8() for the rationale/logic.
43            inner: encoding.new_decoder_without_bom_handling(),
44        }
45    }
46
47    pub fn decode_to_utf8(&mut self, input: Bytes) -> Bytes {
48        let mut total_read_from_input = 0;
49        let mut total_had_errors = false;
50
51        loop {
52            let (result, read, written, had_errors) = self.inner.decode_to_utf8(
53                &input[total_read_from_input..],
54                &mut self.buffer,
55                false, // not last (since we are processing a continuous stream)
56            );
57
58            total_read_from_input += read;
59            total_had_errors |= had_errors;
60
61            self.output.extend_from_slice(&self.buffer[..written]);
62
63            match result {
64                CoderResult::InputEmpty => break, // we have consumed all of the given input so we are done!
65                CoderResult::OutputFull => (), // continue reading from the input in the next loop iteration
66            }
67        }
68
69        if total_had_errors {
70            emit!(DecoderMalformedReplacement {
71                from_encoding: self.inner.encoding().name()
72            });
73        }
74
75        let output = self.output.split().freeze();
76
77        // All of the input (including any BOM sequences present) has been decoded
78        // to utf-8 by now so we can check to see if the output starts with utf-8
79        // BOM marker bytes and if it does, remove it for the final output.
80        //
81        // We can choose not to strip the BOM marker and keep it as is, but the
82        // presence of these extra bytes can throw off any downstream processing
83        // we do on the output, and rather than handling it specially on each
84        // processing, we handle it centrally here. Also, the BOM does not serve
85        // any more use for us, since the source encoding is already pre-identified
86        // as part of decoder initialization.
87        if output.get(..BOM_UTF8_LEN) == Some(BOM_UTF8) {
88            emit!(DecoderBomRemoval {
89                from_encoding: self.inner.encoding().name()
90            });
91            output.slice(BOM_UTF8_LEN..)
92        } else {
93            output
94        }
95    }
96}
97
98/// Helps transcoding to the specified encoding from utf8
99pub struct Encoder {
100    buffer: [u8; BUFFER_SIZE],
101    output: BytesMut,
102    inner: encoding_rs::Encoder,
103    // Useful for tracking whether the encoder's encoding is utf-16 (and when it
104    // is, its variety). Since encoding_rs does not have encoders for utf-16,
105    // this is necessary:
106    // https://docs.rs/encoding_rs/0.8.26/encoding_rs/index.html#utf-16le-utf-16be-and-unicode-encoding-schemes
107    utf16_encoding: Option<Utf16Encoding>,
108}
109
110#[derive(Debug, Clone, Copy)]
111enum Utf16Encoding {
112    Le, // little-endian
113    Be, // big-endian
114}
115
116impl Encoder {
117    pub fn new(encoding: &'static Encoding) -> Self {
118        Self {
119            buffer: [0; BUFFER_SIZE],
120            output: BytesMut::new(),
121            inner: encoding.new_encoder(),
122            utf16_encoding: Self::get_utf16_encoding(encoding),
123        }
124    }
125
126    fn get_utf16_encoding(encoding: &'static Encoding) -> Option<Utf16Encoding> {
127        match encoding.name() {
128            "UTF-16LE" => Some(Utf16Encoding::Le),
129            "UTF-16BE" => Some(Utf16Encoding::Be),
130            _ => None,
131        }
132    }
133
134    fn encode_from_utf8_to_utf16(&mut self, input: &str, variant: Utf16Encoding) -> Bytes {
135        let to_bytes_func = match variant {
136            Utf16Encoding::Le => u16::to_le_bytes,
137            Utf16Encoding::Be => u16::to_be_bytes,
138        };
139
140        for utf16_value in input.encode_utf16() {
141            self.output.extend_from_slice(&to_bytes_func(utf16_value));
142        }
143
144        self.output.split().freeze()
145    }
146
147    pub fn encode_from_utf8(&mut self, input: &str) -> Bytes {
148        // alternate logic if the encoder is for a utf-16 encoding variant
149        if let Some(variant) = self.utf16_encoding {
150            return self.encode_from_utf8_to_utf16(input, variant);
151        }
152
153        let mut total_read_from_input = 0;
154        let mut total_had_errors = false;
155
156        loop {
157            #[expect(
158                clippy::string_slice,
159                reason = "total_read_from_input is a byte offset returned by the encoder, always a char boundary"
160            )]
161            let (result, read, written, had_errors) = self.inner.encode_from_utf8(
162                &input[total_read_from_input..],
163                &mut self.buffer,
164                false, // not last (since we are processing a continuous stream)
165            );
166
167            total_read_from_input += read;
168            total_had_errors |= had_errors;
169
170            self.output.extend_from_slice(&self.buffer[..written]);
171
172            match result {
173                CoderResult::InputEmpty => break, // we have consumed all of the given input so we are done!
174                CoderResult::OutputFull => (), // continue reading from the input in the next loop iteration
175            }
176        }
177
178        if total_had_errors {
179            emit!(EncoderUnmappableReplacement {
180                to_encoding: self.inner.encoding().name()
181            });
182        }
183
184        self.output.split().freeze()
185    }
186}
187
188#[cfg(test)]
189mod tests {
190    use std::char::REPLACEMENT_CHARACTER;
191
192    use bytes::Bytes;
193    use encoding_rs::{SHIFT_JIS, UTF_8, UTF_16BE, UTF_16LE};
194
195    use super::{BOM_UTF8, Decoder, Encoder};
196
197    // BOM unicode character (U+FEFF) expressed in utf-16
198    // http://unicode.org/faq/utf_bom.html#bom4
199    const BOM_UTF16LE: &[u8] = b"\xff\xfe";
200
201    // test UTF_16LE data
202    const fn test_data_utf16le_123() -> &'static [u8] {
203        b"1\x002\x003\x00"
204    }
205
206    const fn test_data_utf16le_crlf() -> &'static [u8] {
207        b"\r\x00\n\x00"
208    }
209
210    const fn test_data_utf16le_vector_devanagari() -> &'static [u8] {
211        b"-\tG\t\x15\tM\t\x1f\t0\t"
212    }
213
214    // test UTF_16BE data
215    const fn test_data_utf16be_123() -> &'static [u8] {
216        b"\x001\x002\x003"
217    }
218
219    const fn test_data_utf16be_crlf() -> &'static [u8] {
220        b"\x00\r\x00\n"
221    }
222
223    const fn test_data_utf16be_vector_devanagari() -> &'static [u8] {
224        b"\t-\tG\t\x15\tM\t\x1f\t0"
225    }
226
227    // test SHIFT_JIS data
228    const fn test_data_shiftjis_helloworld_japanese() -> &'static [u8] {
229        b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h"
230    }
231
232    #[test]
233    fn test_decoder_various() {
234        let mut d = Decoder::new(UTF_8);
235        assert_eq!(d.decode_to_utf8(Bytes::from("123")), Bytes::from("123"));
236        assert_eq!(d.decode_to_utf8(Bytes::from("\n")), Bytes::from("\n"));
237        assert_eq!(d.decode_to_utf8(Bytes::from("भेक्टर")), Bytes::from("भेक्टर"));
238
239        let mut d = Decoder::new(UTF_16LE);
240        assert_eq!(
241            d.decode_to_utf8(Bytes::from(test_data_utf16le_123())),
242            Bytes::from("123")
243        );
244        assert_eq!(
245            d.decode_to_utf8(Bytes::from(test_data_utf16le_crlf())),
246            Bytes::from("\r\n")
247        );
248        assert_eq!(
249            d.decode_to_utf8(Bytes::from(test_data_utf16le_vector_devanagari())),
250            Bytes::from("भेक्टर")
251        );
252
253        let mut d = Decoder::new(UTF_16BE);
254        assert_eq!(
255            d.decode_to_utf8(Bytes::from(test_data_utf16be_123())),
256            Bytes::from("123")
257        );
258        assert_eq!(
259            d.decode_to_utf8(Bytes::from(test_data_utf16be_crlf())),
260            Bytes::from("\r\n")
261        );
262        assert_eq!(
263            d.decode_to_utf8(Bytes::from(test_data_utf16be_vector_devanagari())),
264            Bytes::from("भेक्टर")
265        );
266
267        let mut d = Decoder::new(SHIFT_JIS);
268        assert_eq!(
269            d.decode_to_utf8(Bytes::from(test_data_shiftjis_helloworld_japanese())),
270            // ハロー・ワールド
271            Bytes::from("\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}")
272        );
273    }
274
275    #[test]
276    fn test_decoder_long_input() {
277        let mut d = Decoder::new(UTF_8);
278
279        let long_input = "This line is super long and will take up more space than Decoder's internal buffer, just to make sure that everything works properly when multiple inner decode calls are involved".repeat(10000);
280
281        assert_eq!(
282            d.decode_to_utf8(Bytes::from(long_input.clone())),
283            Bytes::from(long_input)
284        );
285    }
286
287    #[test]
288    fn test_decoder_replacements() {
289        let mut d = Decoder::new(UTF_8);
290
291        // utf-16le BOM contains bytes not mappable to utf-8 so we should see
292        // replacement characters in place of it
293        let problematic_input = [BOM_UTF16LE, b"123"].concat();
294
295        assert_eq!(
296            d.decode_to_utf8(Bytes::from(problematic_input)),
297            Bytes::from(format!("{REPLACEMENT_CHARACTER}{REPLACEMENT_CHARACTER}123"))
298        );
299    }
300
301    #[test]
302    fn test_decoder_bom_removal() {
303        let mut d = Decoder::new(UTF_16LE);
304
305        let input_bom_start = [BOM_UTF16LE, test_data_utf16le_123()].concat();
306
307        // starting BOM should be removed for first input
308        assert_eq!(
309            d.decode_to_utf8(Bytes::from(input_bom_start.clone())),
310            Bytes::from("123")
311        );
312
313        // starting BOM should continue to be removed for subsequent inputs
314        assert_eq!(
315            d.decode_to_utf8(Bytes::from(input_bom_start)),
316            Bytes::from("123")
317        );
318
319        // but if BOM is not at the start, it should be left untouched
320        assert_eq!(
321            d.decode_to_utf8(Bytes::from(
322                [
323                    test_data_utf16le_123(),
324                    BOM_UTF16LE,
325                    test_data_utf16le_123(),
326                ]
327                .concat()
328            )),
329            Bytes::from([b"123", BOM_UTF8, b"123"].concat())
330        );
331
332        // inputs without BOM should continue to work
333        assert_eq!(
334            d.decode_to_utf8(Bytes::from(test_data_utf16le_123())),
335            Bytes::from("123")
336        );
337        assert_eq!(
338            d.decode_to_utf8(Bytes::from(test_data_utf16le_crlf())),
339            Bytes::from("\r\n")
340        );
341    }
342
343    #[test]
344    fn test_encoder_various() {
345        let mut d = Encoder::new(UTF_8);
346        assert_eq!(d.encode_from_utf8("123"), Bytes::from("123"));
347        assert_eq!(d.encode_from_utf8("\n"), Bytes::from("\n"));
348        assert_eq!(d.encode_from_utf8("भेक्टर"), Bytes::from("भेक्टर"));
349
350        let mut d = Encoder::new(UTF_16LE);
351        assert_eq!(
352            d.encode_from_utf8("123"),
353            Bytes::from(test_data_utf16le_123())
354        );
355        assert_eq!(
356            d.encode_from_utf8("\r\n"),
357            Bytes::from(test_data_utf16le_crlf())
358        );
359        assert_eq!(
360            d.encode_from_utf8("भेक्टर"),
361            Bytes::from(test_data_utf16le_vector_devanagari())
362        );
363
364        let mut d = Encoder::new(UTF_16BE);
365        assert_eq!(
366            d.encode_from_utf8("123"),
367            Bytes::from(test_data_utf16be_123())
368        );
369        assert_eq!(
370            d.encode_from_utf8("\r\n"),
371            Bytes::from(test_data_utf16be_crlf())
372        );
373        assert_eq!(
374            d.encode_from_utf8("भेक्टर"),
375            Bytes::from(test_data_utf16be_vector_devanagari())
376        );
377
378        let mut d = Encoder::new(SHIFT_JIS);
379        assert_eq!(
380            // ハロー・ワールド
381            d.encode_from_utf8("\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}"),
382            Bytes::from(test_data_shiftjis_helloworld_japanese())
383        );
384    }
385
386    #[test]
387    fn test_encoder_long_input() {
388        let mut d = Encoder::new(UTF_8);
389
390        let long_input = "This line is super long and will take up more space than Encoder's internal buffer, just to make sure that everything works properly when multiple inner encode calls are involved".repeat(10000);
391
392        assert_eq!(
393            d.encode_from_utf8(long_input.as_str()),
394            Bytes::from(long_input)
395        );
396    }
397
398    #[test]
399    fn test_encoder_replacements() {
400        let mut d = Encoder::new(SHIFT_JIS);
401
402        // surrounding unicode characters here [☸ & ☯︎] are not mappable to
403        // shift JIS, we should see numeric character references in place of it
404        let problematic_input = "\u{2638}123\u{262F}";
405
406        assert_eq!(
407            d.encode_from_utf8(problematic_input),
408            Bytes::from(format!("{}123{}", "&#9784;", "&#9775;"))
409        );
410    }
411
412    #[test]
413    fn test_transcode_symmetry() {
414        let encoding = UTF_16LE;
415        let mut encoder = Encoder::new(encoding);
416        let mut decoder = Decoder::new(encoding);
417
418        let input = "οὐροβόρος";
419
420        assert_eq!(
421            // this should be an identity operation for our input plus the choice
422            // of encoding (no BOM bytes in the input, plus the unicode characters
423            // can be represented fully in both utf8 and utf16)
424            decoder.decode_to_utf8(encoder.encode_from_utf8(input)),
425            Bytes::from(input),
426        );
427    }
428}
// vector/encoding_transcode.rs