1 //! Find the optimal data mode sequence to encode a piece of data.
2 use crate::types::{Mode, Version};
3 use std::borrow::Borrow;
4 use std::slice::Iter;
5
6 //------------------------------------------------------------------------------
7 //{{{ Segment
8
9 /// A segment of data committed to an encoding mode.
10 #[derive(PartialEq, Eq, Debug, Copy, Clone)]
11 pub struct Segment {
12 /// The encoding mode of the segment of data.
13 pub mode: Mode,
14
15 /// The start index of the segment.
16 pub begin: usize,
17
18 /// The end index (exclusive) of the segment.
19 pub end: usize,
20 }
21
22 impl Segment {
23 /// Compute the number of bits (including the size of the mode indicator and
24 /// length bits) when this segment is encoded.
encoded_len(&self, version: Version) -> usize25 pub fn encoded_len(&self, version: Version) -> usize {
26 let byte_size = self.end - self.begin;
27 let chars_count = if self.mode == Mode::Kanji {
28 byte_size / 2
29 } else {
30 byte_size
31 };
32
33 let mode_bits_count = version.mode_bits_count();
34 let length_bits_count = self.mode.length_bits_count(version);
35 let data_bits_count = self.mode.data_bits_count(chars_count);
36
37 mode_bits_count + length_bits_count + data_bits_count
38 }
39
40 /// Panics if `&self` is not a valid segment.
assert_invariants(&self)41 fn assert_invariants(&self) {
42 // TODO: It would be great if it would be impossible to construct an invalid `Segment` -
43 // either by 1) making the fields private and only allowing construction via a public,
44 // preconditions-checking API, or 2) keeping the fields public but replacing `end: usize`
45 // with `len: NonZeroUsize`.
46 assert!(self.begin < self.end);
47 }
48 }
49
50 //}}}
51 //------------------------------------------------------------------------------
52 //{{{ Parser
53
54 /// This iterator is basically equivalent to
55 ///
56 /// ```ignore
57 /// data.map(|c| ExclCharSet::from_u8(*c))
58 /// .chain(Some(ExclCharSet::End).move_iter())
59 /// .enumerate()
60 /// ```
61 ///
62 /// But the type is too hard to write, thus the new type.
63 ///
64 struct EcsIter<I> {
65 base: I,
66 index: usize,
67 ended: bool,
68 }
69
70 impl<'a, I: Iterator<Item = &'a u8>> Iterator for EcsIter<I> {
71 type Item = (usize, ExclCharSet);
72
next(&mut self) -> Option<(usize, ExclCharSet)>73 fn next(&mut self) -> Option<(usize, ExclCharSet)> {
74 if self.ended {
75 return None;
76 }
77
78 match self.base.next() {
79 None => {
80 self.ended = true;
81 Some((self.index, ExclCharSet::End))
82 }
83 Some(c) => {
84 let old_index = self.index;
85 self.index += 1;
86 Some((old_index, ExclCharSet::from_u8(*c)))
87 }
88 }
89 }
90 }
91
92 /// QR code data parser to classify the input into distinct segments.
93 pub struct Parser<'a> {
94 ecs_iter: EcsIter<Iter<'a, u8>>,
95 state: State,
96 begin: usize,
97 pending_single_byte: bool,
98 }
99
100 impl<'a> Parser<'a> {
101 /// Creates a new iterator which parse the data into segments that only
102 /// contains their exclusive subsets. No optimization is done at this point.
103 ///
104 /// use qr_code::optimize::{Parser, Segment};
105 /// use qr_code::types::Mode::{Alphanumeric, Numeric, Byte};
106 ///
107 /// let parse_res = Parser::new(b"ABC123abcd").collect::<Vec<Segment>>();
108 /// assert_eq!(parse_res, vec![Segment { mode: Alphanumeric, begin: 0, end: 3 },
109 /// Segment { mode: Numeric, begin: 3, end: 6 },
110 /// Segment { mode: Byte, begin: 6, end: 10 }]);
111 ///
new(data: &[u8]) -> Parser112 pub fn new(data: &[u8]) -> Parser {
113 Parser {
114 ecs_iter: EcsIter {
115 base: data.iter(),
116 index: 0,
117 ended: false,
118 },
119 state: State::Init,
120 begin: 0,
121 pending_single_byte: false,
122 }
123 }
124 }
125
126 impl<'a> Iterator for Parser<'a> {
127 type Item = Segment;
128
next(&mut self) -> Option<Segment>129 fn next(&mut self) -> Option<Segment> {
130 if self.pending_single_byte {
131 self.pending_single_byte = false;
132 self.begin += 1;
133 return Some(Segment {
134 mode: Mode::Byte,
135 begin: self.begin - 1,
136 end: self.begin,
137 });
138 }
139
140 loop {
141 let (i, ecs) = match self.ecs_iter.next() {
142 None => return None,
143 Some(a) => a,
144 };
145 let (next_state, action) = STATE_TRANSITION[self.state as usize + ecs as usize];
146 self.state = next_state;
147
148 let old_begin = self.begin;
149 let push_mode = match action {
150 Action::Idle => continue,
151 Action::Numeric => Mode::Numeric,
152 Action::Alpha => Mode::Alphanumeric,
153 Action::Byte => Mode::Byte,
154 Action::Kanji => Mode::Kanji,
155 Action::KanjiAndSingleByte => {
156 let next_begin = i - 1;
157 if self.begin == next_begin {
158 Mode::Byte
159 } else {
160 self.pending_single_byte = true;
161 self.begin = next_begin;
162 return Some(Segment {
163 mode: Mode::Kanji,
164 begin: old_begin,
165 end: next_begin,
166 });
167 }
168 }
169 };
170
171 self.begin = i;
172 return Some(Segment {
173 mode: push_mode,
174 begin: old_begin,
175 end: i,
176 });
177 }
178 }
179 }
180
181 #[cfg(test)]
182 mod parse_tests {
183 use crate::optimize::{Parser, Segment};
184 use crate::types::Mode;
185
parse(data: &[u8]) -> Vec<Segment>186 fn parse(data: &[u8]) -> Vec<Segment> {
187 Parser::new(data).collect()
188 }
189
190 #[test]
test_parse_1()191 fn test_parse_1() {
192 let segs = parse(b"01049123451234591597033130128%10ABC123");
193 assert_eq!(
194 segs,
195 vec![
196 Segment {
197 mode: Mode::Numeric,
198 begin: 0,
199 end: 29
200 },
201 Segment {
202 mode: Mode::Alphanumeric,
203 begin: 29,
204 end: 30
205 },
206 Segment {
207 mode: Mode::Numeric,
208 begin: 30,
209 end: 32
210 },
211 Segment {
212 mode: Mode::Alphanumeric,
213 begin: 32,
214 end: 35
215 },
216 Segment {
217 mode: Mode::Numeric,
218 begin: 35,
219 end: 38
220 },
221 ]
222 );
223 }
224
225 #[test]
test_parse_shift_jis_example_1()226 fn test_parse_shift_jis_example_1() {
227 let segs = parse(b"\x82\xa0\x81\x41\x41\xb1\x81\xf0"); // "あ、AアÅ"
228 assert_eq!(
229 segs,
230 vec![
231 Segment {
232 mode: Mode::Kanji,
233 begin: 0,
234 end: 4
235 },
236 Segment {
237 mode: Mode::Alphanumeric,
238 begin: 4,
239 end: 5
240 },
241 Segment {
242 mode: Mode::Byte,
243 begin: 5,
244 end: 6
245 },
246 Segment {
247 mode: Mode::Kanji,
248 begin: 6,
249 end: 8
250 },
251 ]
252 );
253 }
254
255 #[test]
test_parse_utf_8()256 fn test_parse_utf_8() {
257 // Mojibake?
258 let segs = parse(b"\xe3\x81\x82\xe3\x80\x81A\xef\xbd\xb1\xe2\x84\xab");
259 assert_eq!(
260 segs,
261 vec![
262 Segment {
263 mode: Mode::Kanji,
264 begin: 0,
265 end: 4
266 },
267 Segment {
268 mode: Mode::Byte,
269 begin: 4,
270 end: 5
271 },
272 Segment {
273 mode: Mode::Kanji,
274 begin: 5,
275 end: 7
276 },
277 Segment {
278 mode: Mode::Byte,
279 begin: 7,
280 end: 10
281 },
282 Segment {
283 mode: Mode::Kanji,
284 begin: 10,
285 end: 12
286 },
287 Segment {
288 mode: Mode::Byte,
289 begin: 12,
290 end: 13
291 },
292 ]
293 );
294 }
295
296 #[test]
test_not_kanji_1()297 fn test_not_kanji_1() {
298 let segs = parse(b"\x81\x30");
299 assert_eq!(
300 segs,
301 vec![
302 Segment {
303 mode: Mode::Byte,
304 begin: 0,
305 end: 1
306 },
307 Segment {
308 mode: Mode::Numeric,
309 begin: 1,
310 end: 2
311 },
312 ]
313 );
314 }
315
316 #[test]
test_not_kanji_2()317 fn test_not_kanji_2() {
318 // Note that it's implementation detail that the byte seq is split into
319 // two. Perhaps adjust the test to check for this.
320 let segs = parse(b"\xeb\xc0");
321 assert_eq!(
322 segs,
323 vec![
324 Segment {
325 mode: Mode::Byte,
326 begin: 0,
327 end: 1
328 },
329 Segment {
330 mode: Mode::Byte,
331 begin: 1,
332 end: 2
333 },
334 ]
335 );
336 }
337
338 #[test]
test_not_kanji_3()339 fn test_not_kanji_3() {
340 let segs = parse(b"\x81\x7f");
341 assert_eq!(
342 segs,
343 vec![
344 Segment {
345 mode: Mode::Byte,
346 begin: 0,
347 end: 1
348 },
349 Segment {
350 mode: Mode::Byte,
351 begin: 1,
352 end: 2
353 },
354 ]
355 );
356 }
357
358 #[test]
test_not_kanji_4()359 fn test_not_kanji_4() {
360 let segs = parse(b"\x81\x40\x81");
361 assert_eq!(
362 segs,
363 vec![
364 Segment {
365 mode: Mode::Kanji,
366 begin: 0,
367 end: 2
368 },
369 Segment {
370 mode: Mode::Byte,
371 begin: 2,
372 end: 3
373 },
374 ]
375 );
376 }
377 }
378
379 /// Implementation of a shortest path algorithm in a graph where:
380 ///
381 /// * Graph nodes represent an encoding that covers an initial slice of segments and ends with a
382 /// given `Mode`. For an input of N segments, the graph will contain at most 4 * N nodes (maybe
383 /// less, because some reencodings are not possible - e.g. it is not possible to reencode a
384 /// `Byte` segment as `Numeric`).
385 /// * Graph edges connect encodings that cover N segments with encodings
386 /// that cover 1 additional segment. Because of possible `Mode` transitions
387 /// nodes can have up to 4 incoming edges and up to 4 outgoing edges.
388 ///
389 /// The algorithm follows the relaxation approach of the
390 /// https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm, but considers the nodes and edges in a
391 /// fixed order (rather than using a priority queue). This is possible because of the constrained,
392 /// structured shape of the graph we are working with.
393 mod shortest_path {
394 use super::Segment;
395 use crate::types::{Mode, Version};
396 use std::cmp::{Ordering, PartialOrd};
397
398 #[derive(Clone)]
399 struct Predecessor {
400 index_of_segment: usize,
401 mode: Mode,
402 }
403
404 #[derive(Clone)]
405 struct ShortestPath {
406 length: usize,
407 predecessor: Option<Predecessor>,
408 }
409
410 pub struct AlgorithmState<'a> {
411 /// Stores shortest paths - see `get_shortest_path` and `get_path_index` for more details.
412 paths: Vec<Option<ShortestPath>>,
413
414 /// Initial segmentation that we are trying to improve.
415 segments: &'a [Segment],
416
417 /// QR code version that determines how many bits are needed to encode a given `Segment`.
418 version: Version,
419 }
420
421 /// Finds the index into `AlgorithmState::paths` for a path that:
422 /// * Includes all the `0..=index_of_segment` segments from `AlgorithmState::segments`
423 /// (this range is inclusive on both ends).
424 /// * Encodes the last segment (i.e. the one at `index_of_segment`) using `mode`
get_path_index(index_of_segment: usize, mode: Mode) -> usize425 fn get_path_index(index_of_segment: usize, mode: Mode) -> usize {
426 index_of_segment * Mode::ALL_MODES.len() + (mode as usize)
427 }
428
429 impl<'a> AlgorithmState<'a> {
430 /// Gets the shortest path that has been computed earlier.
431 ///
432 /// * For more details about `ending_mode` and `index_of_segment` see the documentation
433 /// of `get_path_index`
434 /// * For more details about the return value see the documentation of
435 /// `compute_shortest_path`.
get_shortest_path( &self, index_of_segment: usize, ending_mode: Mode, ) -> &Option<ShortestPath>436 fn get_shortest_path(
437 &self,
438 index_of_segment: usize,
439 ending_mode: Mode,
440 ) -> &Option<ShortestPath> {
441 &self.paths[get_path_index(index_of_segment, ending_mode)]
442 }
443
get_end_of_predecessor(&self, predecessor: &Option<Predecessor>) -> usize444 fn get_end_of_predecessor(&self, predecessor: &Option<Predecessor>) -> usize {
445 match predecessor {
446 None => 0,
447 Some(predecessor) => self.segments[predecessor.index_of_segment].end,
448 }
449 }
450
451 /// Computes the shortest path that
452 /// * Includes all the segments in `self.segments[0..=index_of_segment]`
453 /// * Encodes the last segment (i.e. the one at `index_of_segment`) using `desired_mode`
454 ///
455 /// Assumes that shortest paths for all earlier segments (i.e. ones before
456 /// `index_of_segment) have already been computed and memoized in `self.paths`.
457 ///
458 /// Returns `None` if the segment at `index_of_segment` can't be encoded using
459 /// `desired_mode` (e.g. contents of `Mode::Byte` segment can't be reencoded using
460 /// `Mode::Numeric` - see also the documentation of `PartialOrd` impl for `Mode`).
compute_shortest_path( &self, index_of_segment: usize, desired_mode: Mode, ) -> Option<ShortestPath>461 fn compute_shortest_path(
462 &self,
463 index_of_segment: usize,
464 desired_mode: Mode,
465 ) -> Option<ShortestPath> {
466 let curr_segment = self.segments[index_of_segment];
467
468 // Check if `curr_segment` can be reencoded using `desired_mode`.
469 match curr_segment.mode.partial_cmp(&desired_mode) {
470 None | Some(Ordering::Greater) => return None,
471 Some(Ordering::Less) | Some(Ordering::Equal) => (),
472 }
473
474 let length_of_reencoding_curr_segment_in_desired_mode = {
475 let reencoded_segment = Segment {
476 begin: curr_segment.begin,
477 end: curr_segment.end,
478 mode: desired_mode,
479 };
480 reencoded_segment.encoded_len(self.version)
481 };
482
483 // Handle the case when there are no predecessors.
484 if index_of_segment == 0 {
485 return Some(ShortestPath {
486 length: length_of_reencoding_curr_segment_in_desired_mode,
487 predecessor: None,
488 });
489 }
490
491 // Find the predecessor with the best mode / compute the shortest path.
492 let prev_index = index_of_segment - 1;
493 Mode::ALL_MODES
494 .iter()
495 .filter_map(|&prev_mode| {
496 self.get_shortest_path(prev_index, prev_mode)
497 .as_ref()
498 .map(|prev_path| (prev_mode, prev_path))
499 })
500 .map(|(prev_mode, prev_path)| {
501 if prev_mode == desired_mode {
502 let merged_length = {
503 let merged_segment = Segment {
504 begin: self.get_end_of_predecessor(&prev_path.predecessor),
505 end: curr_segment.end,
506 mode: desired_mode,
507 };
508 merged_segment.encoded_len(self.version)
509 };
510 let length_up_to_merged_segment = match prev_path.predecessor.as_ref() {
511 None => 0,
512 Some(predecessor) => {
513 self.get_shortest_path(
514 predecessor.index_of_segment,
515 predecessor.mode,
516 )
517 .as_ref()
518 .expect("Predecessors should point at a valid path")
519 .length
520 }
521 };
522 ShortestPath {
523 length: length_up_to_merged_segment + merged_length,
524 predecessor: prev_path.predecessor.clone(),
525 }
526 } else {
527 ShortestPath {
528 length: prev_path.length
529 + length_of_reencoding_curr_segment_in_desired_mode,
530 predecessor: Some(Predecessor {
531 index_of_segment: prev_index,
532 mode: prev_mode,
533 }),
534 }
535 }
536 })
537 .min_by_key(|path| path.length)
538 }
539
540 /// Runs the shortest-path algorithm, fully populating `Self::paths`.
find_shortest_paths(segments: &'a [Segment], version: Version) -> Self541 pub fn find_shortest_paths(segments: &'a [Segment], version: Version) -> Self {
542 let mut this = AlgorithmState::<'a> {
543 segments,
544 paths: vec![None; segments.len() * Mode::ALL_MODES.len()],
545 version,
546 };
547 for index_of_segment in 0..segments.len() {
548 for &mode in Mode::ALL_MODES {
549 this.paths[get_path_index(index_of_segment, mode)] =
550 this.compute_shortest_path(index_of_segment, mode);
551 }
552 }
553 this
554 }
555
556 /// Constructs the best segmentation from prepopulated `self.paths`.
construct_best_segmentation(&self) -> Vec<Segment>557 pub fn construct_best_segmentation(&self) -> Vec<Segment> {
558 let mut result = Vec::new();
559
560 // Start at the last segment (since we want the best encoding
561 // that covers all of the input segments) and find the mode that
562 // results in the shortest path for the last segment.
563 let mut curr_index_of_segment = self.segments.len() - 1;
564 let (mut best_mode, mut shortest_path) = Mode::ALL_MODES
565 .iter()
566 .filter_map(|&mode| {
567 self.get_shortest_path(curr_index_of_segment, mode)
568 .as_ref()
569 .map(|shortest_path| (mode, shortest_path))
570 })
571 .min_by_key(|(_mode, path)| path.length)
572 .expect("At least one path should always exist");
573
574 // Work backwards to construct the shortest path based on the predecessor information.
575 loop {
576 result.push(Segment {
577 begin: self.get_end_of_predecessor(&shortest_path.predecessor),
578 end: self.segments[curr_index_of_segment].end,
579 mode: best_mode,
580 });
581 match shortest_path.predecessor.as_ref() {
582 None => {
583 result.reverse();
584 return result;
585 }
586 Some(predecessor) => {
587 curr_index_of_segment = predecessor.index_of_segment;
588 best_mode = predecessor.mode;
589 shortest_path = self
590 .get_shortest_path(curr_index_of_segment, best_mode)
591 .as_ref()
592 .expect("Predecessors should point at valid paths");
593 }
594 };
595 }
596 }
597 }
598 }
599
600 /// Panics if `segments` are not consecutive (i.e. if there are gaps, or overlaps, or if the
601 /// segments are not ordered by their `begin` field).
assert_valid_segmentation(segments: &[Segment])602 fn assert_valid_segmentation(segments: &[Segment]) {
603 if segments.is_empty() {
604 return;
605 }
606
607 for segment in segments.iter() {
608 segment.assert_invariants();
609 }
610
611 let consecutive_pairs = segments[..(segments.len() - 1)]
612 .iter()
613 .zip(segments[1..].iter());
614 for (prev, next) in consecutive_pairs {
615 assert_eq!(prev.end, next.begin, "Non-adjacent segments");
616 }
617 }
618
619 /// Optimize the segments by combining segments when possible.
620 ///
621 /// Optimization considers all possible segmentations where each of the input `segments`
622 /// may have been reencoded into a different mode (and where adjacent, same-mode segments are
623 /// merged). The shortest of the considered segmentations is returned. The implementation uses a
624 /// shortest-path algorithm that runs in O(N) time and uses additional O(N) memory.
625 ///
626 /// This function may panic if `segments` do not represent a valid segmentation
627 /// (e.g. if there are gaps between segments, if the segments overlap, etc.).
optimize_segmentation(segments: &[Segment], version: Version) -> Vec<Segment>628 pub fn optimize_segmentation(segments: &[Segment], version: Version) -> Vec<Segment> {
629 assert_valid_segmentation(segments);
630 if segments.is_empty() {
631 return Vec::new();
632 }
633
634 let optimized_segments = shortest_path::AlgorithmState::find_shortest_paths(segments, version)
635 .construct_best_segmentation();
636
637 #[cfg(fuzzing)]
638 optimize_test_helpers::assert_valid_optimization(&segments, &*optimized_segments, version);
639
640 optimized_segments
641 }
642
643 /// Computes the total encoded length of all segments.
total_encoded_len<I>(segments: I, version: Version) -> usize where I: IntoIterator, I::Item: Borrow<Segment>,644 pub fn total_encoded_len<I>(segments: I, version: Version) -> usize
645 where
646 I: IntoIterator,
647 I::Item: Borrow<Segment>,
648 {
649 segments
650 .into_iter()
651 .map(|seg| seg.borrow().encoded_len(version))
652 .sum()
653 }
654
655 #[cfg(any(fuzzing, test))]
656 mod optimize_test_helpers {
657 use super::{assert_valid_segmentation, total_encoded_len, Segment};
658 use crate::types::{Mode, Version};
659 use std::cmp::Ordering;
660
661 /// Panics if there exists an input that can be represented by `given` segments,
662 /// but that cannot be represented by `opt` segments.
assert_segmentation_equivalence(given: &[Segment], opt: &[Segment])663 pub fn assert_segmentation_equivalence(given: &[Segment], opt: &[Segment]) {
664 if given.is_empty() {
665 assert!(opt.is_empty());
666 return;
667 }
668 let begin = given.first().unwrap().begin;
669 let end = given.last().unwrap().end;
670
671 // Verify that `opt` covers the same range as `given`.
672 // (This assumes that contiguous coverage has already been verified by
673 // `assert_valid_segmentation`.)
674 assert!(!opt.is_empty());
675 assert_eq!(begin, opt.first().unwrap().begin);
676 assert_eq!(end, opt.last().unwrap().end);
677
678 // Verify that for each character, `opt` can represent all the characters that may be
679 // present in `given`.
680 for i in begin..end {
681 fn find_mode(segments: &[Segment], i: usize) {
682 segments
683 .iter()
684 .filter(|s| (s.begin <= i) && (i < s.end))
685 .next()
686 .expect("Expecting exactly one segment")
687 .mode;
688 }
689 let given_mode = find_mode(&*given, i);
690 let opt_mode = find_mode(&*given, i);
691 match given_mode.partial_cmp(&opt_mode) {
692 Some(Ordering::Less) | Some(Ordering::Equal) => (),
693 _ => panic!(
694 "Character #{} is {:?}, which {:?} may not represent",
695 i, given_mode, opt_mode,
696 ),
697 }
698 }
699 }
700
701 /// Panics if `optimized` is not an improved representation of `input`.
assert_valid_optimization(input: &[Segment], optimized: &[Segment], version: Version)702 pub fn assert_valid_optimization(input: &[Segment], optimized: &[Segment], version: Version) {
703 assert_valid_segmentation(input);
704 assert_valid_segmentation(optimized);
705 assert_segmentation_equivalence(input, optimized);
706
707 let input_len = total_encoded_len(input, version);
708 let optimized_len = total_encoded_len(optimized, version);
709 assert!(optimized_len <= input_len);
710
711 let single_bytes_segment_len = if input.is_empty() {
712 0
713 } else {
714 Segment {
715 begin: input.first().unwrap().begin,
716 end: input.last().unwrap().end,
717 mode: Mode::Byte,
718 }
719 .encoded_len(version)
720 };
721 assert!(optimized_len <= single_bytes_segment_len);
722 }
723 }
724
725 #[cfg(test)]
726 mod optimize_tests {
727 use super::optimize_test_helpers::*;
728 use super::{assert_valid_segmentation, optimize_segmentation, total_encoded_len, Segment};
729 use crate::types::{Mode, Version};
730 use std::cmp::Ordering;
731
test_optimization_result(given: Vec<Segment>, expected: Vec<Segment>, version: Version)732 fn test_optimization_result(given: Vec<Segment>, expected: Vec<Segment>, version: Version) {
733 // Verify that the test input is valid.
734 assert_valid_segmentation(&given);
735
736 // Verify that the test expectations are compatible with the test input.
737 assert_valid_segmentation(&expected);
738 assert_segmentation_equivalence(&given, &expected);
739
740 let opt_segs = optimize_segmentation(given.as_slice(), version);
741 assert_valid_optimization(&given, &opt_segs, version);
742
743 // Verify that optimization produces result that is as short as `expected`.
744 if opt_segs != expected {
745 let actual_len = total_encoded_len(&opt_segs, version);
746 let expected_len = total_encoded_len(&expected, version);
747 let msg = match actual_len.cmp(&expected_len) {
748 Ordering::Less => "Optimization gave something better than expected",
749 Ordering::Equal => "Optimization gave something different, but just as short",
750 Ordering::Greater => "Optimization gave something worse than expected",
751 };
752 panic!(
753 "{}: expected_len={}; actual_len={}; opt_segs=({:?})",
754 msg, expected_len, actual_len, opt_segs
755 );
756 }
757 }
758
759 #[test]
test_example_1()760 fn test_example_1() {
761 test_optimization_result(
762 vec![
763 Segment {
764 mode: Mode::Alphanumeric,
765 begin: 0,
766 end: 3,
767 },
768 Segment {
769 mode: Mode::Numeric,
770 begin: 3,
771 end: 6,
772 },
773 Segment {
774 mode: Mode::Byte,
775 begin: 6,
776 end: 10,
777 },
778 ],
779 vec![
780 Segment {
781 mode: Mode::Alphanumeric,
782 begin: 0,
783 end: 6,
784 },
785 Segment {
786 mode: Mode::Byte,
787 begin: 6,
788 end: 10,
789 },
790 ],
791 Version::Normal(1),
792 );
793 }
794
795 #[test]
test_example_2()796 fn test_example_2() {
797 test_optimization_result(
798 vec![
799 Segment {
800 mode: Mode::Numeric,
801 begin: 0,
802 end: 29,
803 },
804 Segment {
805 mode: Mode::Alphanumeric,
806 begin: 29,
807 end: 30,
808 },
809 Segment {
810 mode: Mode::Numeric,
811 begin: 30,
812 end: 32,
813 },
814 Segment {
815 mode: Mode::Alphanumeric,
816 begin: 32,
817 end: 35,
818 },
819 Segment {
820 mode: Mode::Numeric,
821 begin: 35,
822 end: 38,
823 },
824 ],
825 vec![
826 Segment {
827 mode: Mode::Numeric,
828 begin: 0,
829 end: 29,
830 },
831 Segment {
832 mode: Mode::Alphanumeric,
833 begin: 29,
834 end: 38,
835 },
836 ],
837 Version::Normal(9),
838 );
839 }
840
841 #[test]
test_example_3()842 fn test_example_3() {
843 test_optimization_result(
844 vec![
845 Segment {
846 mode: Mode::Kanji,
847 begin: 0,
848 end: 4,
849 },
850 Segment {
851 mode: Mode::Alphanumeric,
852 begin: 4,
853 end: 5,
854 },
855 Segment {
856 mode: Mode::Byte,
857 begin: 5,
858 end: 6,
859 },
860 Segment {
861 mode: Mode::Kanji,
862 begin: 6,
863 end: 8,
864 },
865 ],
866 vec![Segment {
867 mode: Mode::Byte,
868 begin: 0,
869 end: 8,
870 }],
871 Version::Normal(1),
872 );
873 }
874
875 #[test]
test_example_4()876 fn test_example_4() {
877 test_optimization_result(
878 vec![
879 Segment {
880 mode: Mode::Kanji,
881 begin: 0,
882 end: 10,
883 },
884 Segment {
885 mode: Mode::Byte,
886 begin: 10,
887 end: 11,
888 },
889 ],
890 vec![
891 Segment {
892 mode: Mode::Kanji,
893 begin: 0,
894 end: 10,
895 },
896 Segment {
897 mode: Mode::Byte,
898 begin: 10,
899 end: 11,
900 },
901 ],
902 Version::Normal(1),
903 );
904 }
905
906 #[test]
test_empty()907 fn test_empty() {
908 test_optimization_result(vec![], vec![], Version::Normal(1));
909 }
910
911 /// This example shows that greedy merging strategy (used by `qr_code` v1.1.0) can give
912 /// suboptimal results for the following input: `"bytesbytesA1B2C3D4E5"`. The greedy strategy
913 /// will always consider it (locally) beneficial to merge the initial `Mode::Byte` segment with
914 /// the subsequent single-char segment - this will result in representing the whole input
915 /// in a single `Mode::Byte` segment. Better segmentation can be done by first merging all the
916 /// `Mode::Alphanumeric` and `Mode::Numeric` segments.
917 #[test]
test_example_where_greedy_merging_is_suboptimal()918 fn test_example_where_greedy_merging_is_suboptimal() {
919 let mut input = vec![Segment {
920 mode: Mode::Byte,
921 begin: 0,
922 end: 10,
923 }];
924 for _ in 0..5 {
925 for &mode in [Mode::Alphanumeric, Mode::Numeric].iter() {
926 let begin = input.last().unwrap().end;
927 let end = begin + 1;
928 input.push(Segment { mode, begin, end });
929 }
930 }
931 test_optimization_result(
932 input,
933 vec![
934 Segment {
935 mode: Mode::Byte,
936 begin: 0,
937 end: 10,
938 },
939 Segment {
940 mode: Mode::Alphanumeric,
941 begin: 10,
942 end: 20,
943 },
944 ],
945 Version::Normal(40),
946 );
947 }
948
949 /// In this example merging 2 consecutive segments is always harmful, but merging many segments
950 /// may be beneficial. This is because merging more than 2 segments saves additional header
951 /// bytes.
952 #[test]
test_example_where_merging_two_consecutive_segments_is_always_harmful()953 fn test_example_where_merging_two_consecutive_segments_is_always_harmful() {
954 let mut input = vec![];
955 for _ in 0..5 {
956 for &mode in [Mode::Byte, Mode::Alphanumeric].iter() {
957 let begin = input.last().map(|s: &Segment| s.end).unwrap_or_default();
958 let end = begin + 10;
959 input.push(Segment { mode, begin, end });
960 }
961 }
962 test_optimization_result(
963 input,
964 vec![
965 Segment {
966 mode: Mode::Byte,
967 begin: 0,
968 end: 90,
969 },
970 Segment {
971 mode: Mode::Alphanumeric,
972 begin: 90,
973 end: 100,
974 },
975 ],
976 Version::Normal(40),
977 );
978 }
979
980 #[test]
test_optimize_base64()981 fn test_optimize_base64() {
982 let input: &[u8] = include_bytes!("../test_data/large_base64.in");
983 let input: Vec<Segment> = super::Parser::new(input).collect();
984 test_optimization_result(
985 input,
986 vec![
987 Segment {
988 mode: Mode::Byte,
989 begin: 0,
990 end: 334,
991 },
992 Segment {
993 mode: Mode::Alphanumeric,
994 begin: 334,
995 end: 349,
996 },
997 Segment {
998 mode: Mode::Byte,
999 begin: 349,
1000 end: 387,
1001 },
1002 Segment {
1003 mode: Mode::Alphanumeric,
1004 begin: 387,
1005 end: 402,
1006 },
1007 Segment {
1008 mode: Mode::Byte,
1009 begin: 402,
1010 end: 850,
1011 },
1012 Segment {
1013 mode: Mode::Alphanumeric,
1014 begin: 850,
1015 end: 866,
1016 },
1017 Segment {
1018 mode: Mode::Byte,
1019 begin: 866,
1020 end: 1146,
1021 },
1022 Segment {
1023 mode: Mode::Alphanumeric,
1024 begin: 1146,
1025 end: 1162,
1026 },
1027 Segment {
1028 mode: Mode::Byte,
1029 begin: 1162,
1030 end: 2474,
1031 },
1032 Segment {
1033 mode: Mode::Alphanumeric,
1034 begin: 2474,
1035 end: 2489,
1036 },
1037 Segment {
1038 mode: Mode::Byte,
1039 begin: 2489,
1040 end: 2618,
1041 },
1042 Segment {
1043 mode: Mode::Alphanumeric,
1044 begin: 2618,
1045 end: 2641,
1046 },
1047 Segment {
1048 mode: Mode::Byte,
1049 begin: 2641,
1050 end: 2707,
1051 },
1052 Segment {
1053 mode: Mode::Alphanumeric,
1054 begin: 2707,
1055 end: 2722,
1056 },
1057 Segment {
1058 mode: Mode::Byte,
1059 begin: 2722,
1060 end: 2880,
1061 },
1062 ],
1063 Version::Normal(40),
1064 );
1065 }
1066
1067 #[test]
test_annex_j_guideline_1a()1068 fn test_annex_j_guideline_1a() {
1069 test_optimization_result(
1070 vec![
1071 Segment {
1072 mode: Mode::Numeric,
1073 begin: 0,
1074 end: 3,
1075 },
1076 Segment {
1077 mode: Mode::Alphanumeric,
1078 begin: 3,
1079 end: 4,
1080 },
1081 ],
1082 vec![
1083 Segment {
1084 mode: Mode::Numeric,
1085 begin: 0,
1086 end: 3,
1087 },
1088 Segment {
1089 mode: Mode::Alphanumeric,
1090 begin: 3,
1091 end: 4,
1092 },
1093 ],
1094 Version::Micro(2),
1095 );
1096 }
1097
1098 #[test]
test_annex_j_guideline_1b()1099 fn test_annex_j_guideline_1b() {
1100 test_optimization_result(
1101 vec![
1102 Segment {
1103 mode: Mode::Numeric,
1104 begin: 0,
1105 end: 2,
1106 },
1107 Segment {
1108 mode: Mode::Alphanumeric,
1109 begin: 2,
1110 end: 4,
1111 },
1112 ],
1113 vec![Segment {
1114 mode: Mode::Alphanumeric,
1115 begin: 0,
1116 end: 4,
1117 }],
1118 Version::Micro(2),
1119 );
1120 }
1121
1122 #[test]
test_annex_j_guideline_1c()1123 fn test_annex_j_guideline_1c() {
1124 test_optimization_result(
1125 vec![
1126 Segment {
1127 mode: Mode::Numeric,
1128 begin: 0,
1129 end: 3,
1130 },
1131 Segment {
1132 mode: Mode::Alphanumeric,
1133 begin: 3,
1134 end: 4,
1135 },
1136 ],
1137 vec![Segment {
1138 mode: Mode::Alphanumeric,
1139 begin: 0,
1140 end: 4,
1141 }],
1142 Version::Micro(3),
1143 );
1144 }
1145 }
1146
1147 #[cfg(bench)]
1148 mod bench {
1149 use super::{optimize_segmentation, total_encoded_len, Parser, Segment};
1150 use crate::Version;
1151
bench_optimize(data: &[u8], version: Version, bencher: &mut test::Bencher)1152 fn bench_optimize(data: &[u8], version: Version, bencher: &mut test::Bencher) {
1153 bencher.iter(|| {
1154 let segments: Vec<Segment> = Parser::new(data).collect();
1155 let segments = optimize_segmentation(segments.as_slice(), version);
1156 test::black_box(total_encoded_len(segments, version))
1157 });
1158 }
1159
1160 #[bench]
bench_optimize_example1(bencher: &mut test::Bencher)1161 fn bench_optimize_example1(bencher: &mut test::Bencher) {
1162 let data = b"QR\x83R\x81[\x83h\x81i\x83L\x83\x85\x81[\x83A\x81[\x83\x8b\x83R\x81[\x83h\x81j\
1163 \x82\xc6\x82\xcd\x81A1994\x94N\x82\xc9\x83f\x83\x93\x83\\\x81[\x82\xcc\x8aJ\
1164 \x94\xad\x95\x94\x96\xe5\x81i\x8c\xbb\x8d\xdd\x82\xcd\x95\xaa\x97\xa3\x82\xb5\x83f\
1165 \x83\x93\x83\\\x81[\x83E\x83F\x81[\x83u\x81j\x82\xaa\x8aJ\x94\xad\x82\xb5\x82\xbd\
1166 \x83}\x83g\x83\x8a\x83b\x83N\x83X\x8c^\x93\xf1\x8e\x9f\x8c\xb3\x83R\x81[\x83h\
1167 \x82\xc5\x82\xa0\x82\xe9\x81B\x82\xc8\x82\xa8\x81AQR\x83R\x81[\x83h\x82\xc6\
1168 \x82\xa2\x82\xa4\x96\xbc\x8f\xcc\x81i\x82\xa8\x82\xe6\x82\xd1\x92P\x8c\xea\x81j\
1169 \x82\xcd\x83f\x83\x93\x83\\\x81[\x83E\x83F\x81[\x83u\x82\xcc\x93o\x98^\x8f\xa4\
1170 \x95W\x81i\x91\xe64075066\x8d\x86\x81j\x82\xc5\x82\xa0\x82\xe9\x81BQR\x82\xcd\
1171 Quick Response\x82\xc9\x97R\x97\x88\x82\xb5\x81A\x8d\x82\x91\xac\x93\xc7\x82\xdd\
1172 \x8e\xe6\x82\xe8\x82\xaa\x82\xc5\x82\xab\x82\xe9\x82\xe6\x82\xa4\x82\xc9\x8aJ\
1173 \x94\xad\x82\xb3\x82\xea\x82\xbd\x81B\x93\x96\x8f\x89\x82\xcd\x8e\xa9\x93\xae\
1174 \x8e\xd4\x95\x94\x95i\x8dH\x8f\xea\x82\xe2\x94z\x91\x97\x83Z\x83\x93\x83^\x81[\
1175 \x82\xc8\x82\xc7\x82\xc5\x82\xcc\x8eg\x97p\x82\xf0\x94O\x93\xaa\x82\xc9\x8aJ\
1176 \x94\xad\x82\xb3\x82\xea\x82\xbd\x82\xaa\x81A\x8c\xbb\x8d\xdd\x82\xc5\x82\xcd\x83X\
1177 \x83}\x81[\x83g\x83t\x83H\x83\x93\x82\xcc\x95\x81\x8by\x82\xc8\x82\xc7\x82\xc9\
1178 \x82\xe6\x82\xe8\x93\xfa\x96{\x82\xc9\x8c\xc0\x82\xe7\x82\xb8\x90\xa2\x8aE\x93I\
1179 \x82\xc9\x95\x81\x8by\x82\xb5\x82\xc4\x82\xa2\x82\xe9\x81B";
1180 bench_optimize(data, Version::Normal(15), bencher);
1181 }
1182
1183 #[bench]
bench_optimize_base64(bencher: &mut test::Bencher)1184 fn bench_optimize_base64(bencher: &mut test::Bencher) {
1185 let data = include_bytes!("../test_data/large_base64.in");
1186 bench_optimize(data, Version::Normal(40), bencher);
1187 }
1188 }
1189
1190 //}}}
1191 //------------------------------------------------------------------------------
1192 //{{{ Internal types and data for parsing
1193
1194 /// All values of `u8` can be split into 9 different character sets when
1195 /// determining which encoding to use. This enum represents these groupings for
1196 /// parsing purpose.
1197 #[derive(Copy, Clone)]
1198 enum ExclCharSet {
1199 /// The end of string.
1200 End = 0,
1201
1202 /// All symbols supported by the Alphanumeric encoding, i.e. space, `$`, `%`,
1203 /// `*`, `+`, `-`, `.`, `/` and `:`.
1204 Symbol = 1,
1205
1206 /// All numbers (0–9).
1207 Numeric = 2,
1208
1209 /// All uppercase letters (A–Z). These characters may also appear in the
1210 /// second byte of a Shift JIS 2-byte encoding.
1211 Alpha = 3,
1212
1213 /// The first byte of a Shift JIS 2-byte encoding, in the range 0x81–0x9f.
1214 KanjiHi1 = 4,
1215
1216 /// The first byte of a Shift JIS 2-byte encoding, in the range 0xe0–0xea.
1217 KanjiHi2 = 5,
1218
1219 /// The first byte of a Shift JIS 2-byte encoding, of value 0xeb. This is
1220 /// different from the other two range that the second byte has a smaller
1221 /// range.
1222 KanjiHi3 = 6,
1223
1224 /// The second byte of a Shift JIS 2-byte encoding, in the range 0x40–0xbf,
1225 /// excluding letters (covered by `Alpha`), 0x81–0x9f (covered by `KanjiHi1`),
1226 /// and the invalid byte 0x7f.
1227 KanjiLo1 = 7,
1228
1229 /// The second byte of a Shift JIS 2-byte encoding, in the range 0xc0–0xfc,
1230 /// excluding the range 0xe0–0xeb (covered by `KanjiHi2` and `KanjiHi3`).
1231 /// This half of byte-pair cannot appear as the second byte leaded by
1232 /// `KanjiHi3`.
1233 KanjiLo2 = 8,
1234
1235 /// Any other values not covered by the above character sets.
1236 Byte = 9,
1237 }
1238
1239 impl ExclCharSet {
1240 /// Determines which character set a byte is in.
from_u8(c: u8) -> Self1241 fn from_u8(c: u8) -> Self {
1242 match c {
1243 0x20 | 0x24 | 0x25 | 0x2a | 0x2b | 0x2d..=0x2f | 0x3a => ExclCharSet::Symbol,
1244 0x30..=0x39 => ExclCharSet::Numeric,
1245 0x41..=0x5a => ExclCharSet::Alpha,
1246 0x81..=0x9f => ExclCharSet::KanjiHi1,
1247 0xe0..=0xea => ExclCharSet::KanjiHi2,
1248 0xeb => ExclCharSet::KanjiHi3,
1249 0x40 | 0x5b..=0x7e | 0x80 | 0xa0..=0xbf => ExclCharSet::KanjiLo1,
1250 0xc0..=0xdf | 0xec..=0xfc => ExclCharSet::KanjiLo2,
1251 _ => ExclCharSet::Byte,
1252 }
1253 }
1254 }
1255
1256 /// The current parsing state.
1257 #[derive(Copy, Clone)]
1258 enum State {
1259 /// Just initialized.
1260 Init = 0,
1261
1262 /// Inside a string that can be exclusively encoded as Numeric.
1263 Numeric = 10,
1264
1265 /// Inside a string that can be exclusively encoded as Alphanumeric.
1266 Alpha = 20,
1267
1268 /// Inside a string that can be exclusively encoded as 8-Bit Byte.
1269 Byte = 30,
1270
1271 /// Just encountered the first byte of a Shift JIS 2-byte sequence of the
1272 /// set `KanjiHi1` or `KanjiHi2`.
1273 KanjiHi12 = 40,
1274
1275 /// Just encountered the first byte of a Shift JIS 2-byte sequence of the
1276 /// set `KanjiHi3`.
1277 KanjiHi3 = 50,
1278
1279 /// Inside a string that can be exclusively encoded as Kanji.
1280 Kanji = 60,
1281 }
1282
1283 /// What should the parser do after a state transition.
1284 #[derive(Copy, Clone)]
1285 enum Action {
1286 /// The parser should do nothing.
1287 Idle,
1288
1289 /// Push the current segment as a Numeric string, and reset the marks.
1290 Numeric,
1291
1292 /// Push the current segment as an Alphanumeric string, and reset the marks.
1293 Alpha,
1294
1295 /// Push the current segment as a 8-Bit Byte string, and reset the marks.
1296 Byte,
1297
1298 /// Push the current segment as a Kanji string, and reset the marks.
1299 Kanji,
1300
1301 /// Push the current segment excluding the last byte as a Kanji string, then
1302 /// push the remaining single byte as a Byte string, and reset the marks.
1303 KanjiAndSingleByte,
1304 }
1305
1306 static STATE_TRANSITION: [(State, Action); 70] = [
1307 // STATE_TRANSITION[current_state + next_character] == (next_state, what_to_do)
1308
1309 // Init state:
1310 (State::Init, Action::Idle), // End
1311 (State::Alpha, Action::Idle), // Symbol
1312 (State::Numeric, Action::Idle), // Numeric
1313 (State::Alpha, Action::Idle), // Alpha
1314 (State::KanjiHi12, Action::Idle), // KanjiHi1
1315 (State::KanjiHi12, Action::Idle), // KanjiHi2
1316 (State::KanjiHi3, Action::Idle), // KanjiHi3
1317 (State::Byte, Action::Idle), // KanjiLo1
1318 (State::Byte, Action::Idle), // KanjiLo2
1319 (State::Byte, Action::Idle), // Byte
1320 // Numeric state:
1321 (State::Init, Action::Numeric), // End
1322 (State::Alpha, Action::Numeric), // Symbol
1323 (State::Numeric, Action::Idle), // Numeric
1324 (State::Alpha, Action::Numeric), // Alpha
1325 (State::KanjiHi12, Action::Numeric), // KanjiHi1
1326 (State::KanjiHi12, Action::Numeric), // KanjiHi2
1327 (State::KanjiHi3, Action::Numeric), // KanjiHi3
1328 (State::Byte, Action::Numeric), // KanjiLo1
1329 (State::Byte, Action::Numeric), // KanjiLo2
1330 (State::Byte, Action::Numeric), // Byte
1331 // Alpha state:
1332 (State::Init, Action::Alpha), // End
1333 (State::Alpha, Action::Idle), // Symbol
1334 (State::Numeric, Action::Alpha), // Numeric
1335 (State::Alpha, Action::Idle), // Alpha
1336 (State::KanjiHi12, Action::Alpha), // KanjiHi1
1337 (State::KanjiHi12, Action::Alpha), // KanjiHi2
1338 (State::KanjiHi3, Action::Alpha), // KanjiHi3
1339 (State::Byte, Action::Alpha), // KanjiLo1
1340 (State::Byte, Action::Alpha), // KanjiLo2
1341 (State::Byte, Action::Alpha), // Byte
1342 // Byte state:
1343 (State::Init, Action::Byte), // End
1344 (State::Alpha, Action::Byte), // Symbol
1345 (State::Numeric, Action::Byte), // Numeric
1346 (State::Alpha, Action::Byte), // Alpha
1347 (State::KanjiHi12, Action::Byte), // KanjiHi1
1348 (State::KanjiHi12, Action::Byte), // KanjiHi2
1349 (State::KanjiHi3, Action::Byte), // KanjiHi3
1350 (State::Byte, Action::Idle), // KanjiLo1
1351 (State::Byte, Action::Idle), // KanjiLo2
1352 (State::Byte, Action::Idle), // Byte
1353 // KanjiHi12 state:
1354 (State::Init, Action::KanjiAndSingleByte), // End
1355 (State::Alpha, Action::KanjiAndSingleByte), // Symbol
1356 (State::Numeric, Action::KanjiAndSingleByte), // Numeric
1357 (State::Kanji, Action::Idle), // Alpha
1358 (State::Kanji, Action::Idle), // KanjiHi1
1359 (State::Kanji, Action::Idle), // KanjiHi2
1360 (State::Kanji, Action::Idle), // KanjiHi3
1361 (State::Kanji, Action::Idle), // KanjiLo1
1362 (State::Kanji, Action::Idle), // KanjiLo2
1363 (State::Byte, Action::KanjiAndSingleByte), // Byte
1364 // KanjiHi3 state:
1365 (State::Init, Action::KanjiAndSingleByte), // End
1366 (State::Alpha, Action::KanjiAndSingleByte), // Symbol
1367 (State::Numeric, Action::KanjiAndSingleByte), // Numeric
1368 (State::Kanji, Action::Idle), // Alpha
1369 (State::Kanji, Action::Idle), // KanjiHi1
1370 (State::KanjiHi12, Action::KanjiAndSingleByte), // KanjiHi2
1371 (State::KanjiHi3, Action::KanjiAndSingleByte), // KanjiHi3
1372 (State::Kanji, Action::Idle), // KanjiLo1
1373 (State::Byte, Action::KanjiAndSingleByte), // KanjiLo2
1374 (State::Byte, Action::KanjiAndSingleByte), // Byte
1375 // Kanji state:
1376 (State::Init, Action::Kanji), // End
1377 (State::Alpha, Action::Kanji), // Symbol
1378 (State::Numeric, Action::Kanji), // Numeric
1379 (State::Alpha, Action::Kanji), // Alpha
1380 (State::KanjiHi12, Action::Idle), // KanjiHi1
1381 (State::KanjiHi12, Action::Idle), // KanjiHi2
1382 (State::KanjiHi3, Action::Idle), // KanjiHi3
1383 (State::Byte, Action::Kanji), // KanjiLo1
1384 (State::Byte, Action::Kanji), // KanjiLo2
1385 (State::Byte, Action::Kanji), // Byte
1386 ];
1387
1388 //}}}
1389