1 //! Contains parser configuration structure.
2 use std::collections::HashMap;
3 use std::io::Read;
4 
5 use crate::reader::EventReader;
6 use crate::util::Encoding;
7 
/// Default cap on the length of the string produced by expanding custom
/// entities. Part of the defence against the "billion laughs" attack.
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
/// Default cap on how many times entities may expand into other entities.
/// Part of the defence against the "billion laughs" attack.
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
11 
/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
///
/// The options collected here influence how the parser behaves and which
/// events it emits.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct ParserConfig {
    /// Whether whitespace in textual events should be removed. Default is false.
    ///
    /// When true, standalone whitespace is dropped entirely (no `Whitespace`
    /// events are emitted), and leading and trailing whitespace is stripped
    /// from `Characters` events. A `Characters` event that ends up empty after
    /// trimming is omitted from the output stream as well; this can only
    /// happen when the `whitespace_to_characters` or `cdata_to_characters`
    /// option is set.
    ///
    /// CDATA events are not affected unless `cdata_to_characters` is also
    /// set, in which case CDATA content is trimmed too.
    pub trim_whitespace: bool,

    /// Whether whitespace should be converted to characters.
    /// Default is false.
    ///
    /// When true, `Characters` events with the same content are emitted in
    /// place of `Whitespace` events. If `trim_whitespace` is also true, these
    /// events are trimmed to nothing and therefore never emitted.
    pub whitespace_to_characters: bool,

    /// Whether CDATA should be converted to characters.
    /// Default is false.
    ///
    /// When true, `Characters` events with the same content are emitted in
    /// place of `CData` events. If `trim_whitespace` is also true, these
    /// events are trimmed, and an event whose CDATA contained nothing but
    /// whitespace is omitted from the stream.
    pub cdata_to_characters: bool,

    /// Whether comments should be omitted. Default is true.
    ///
    /// When true, no `Comment` events are emitted at all.
    pub ignore_comments: bool,

    /// Whether sequential `Characters` events should be merged.
    /// Default is true.
    ///
    /// When true, a run of consecutive `Characters` events is collapsed into
    /// a single event whose data is the concatenation of theirs.
    ///
    /// Consecutive `Characters` events can only occur when either
    /// `cdata_to_characters` or `ignore_comments` is set. Otherwise character
    /// events are always separated by other events.
    pub coalesce_characters: bool,

    /// A map of extra entities recognized by the parser. Default is an empty map.
    ///
    /// Out of the box the XML parser recognizes the entities defined in the XML spec.
    /// Sometimes it is convenient to make the parser recognize additional entities which
    /// are also not available through the DTD definitions (especially given that at the
    /// moment DTD parsing is not supported).
    pub extra_entities: HashMap<String, String>,

    /// Whether the parser should ignore the end of stream. Default is false.
    ///
    /// Normally the parser either errors out when it encounters a premature end of
    /// stream, or completes normally if the end of stream was expected. To keep reading
    /// from a stream whose input is supplied progressively, set this option to true.
    /// The parser will then allow `next()` to be invoked even after a supposed end
    /// of stream has happened.
    ///
    /// Note that support for this functionality is incomplete; for example, the parser
    /// will fail if the premature end of stream happens inside PCDATA. Therefore, use
    /// this option at your own risk.
    pub ignore_end_of_stream: bool,

    /// Whether non-unicode entity references get replaced with the replacement character.
    /// Default is false.
    ///
    /// When true, any decimal or hexadecimal character reference that cannot be converted from a
    /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html)
    /// is converted into the unicode REPLACEMENT CHARACTER (U+FFFD).
    pub replace_unknown_entity_references: bool,

    /// Whether whitespace at the root level of the document is ignored. Default is true.
    ///
    /// By default any whitespace that is not enclosed within at least one level of elements
    /// is ignored. Setting this value to false causes root level whitespace events to be emitted.
    ///
    /// **There are more configuration options – see methods below**
    pub ignore_root_level_whitespace: bool,
}
99 
100 impl ParserConfig {
101     /// Returns a new config with default values.
102     ///
103     /// You can tweak default values using builder-like pattern:
104     ///
105     /// ```rust
106     /// use xml::reader::ParserConfig;
107     ///
108     /// let config = ParserConfig::new()
109     ///     .trim_whitespace(true)
110     ///     .ignore_comments(true)
111     ///     .coalesce_characters(false);
112     /// ```
113     #[must_use]
114     #[inline]
new() -> ParserConfig115     pub fn new() -> ParserConfig {
116         ParserConfig {
117             trim_whitespace: false,
118             whitespace_to_characters: false,
119             cdata_to_characters: false,
120             ignore_comments: true,
121             coalesce_characters: true,
122             extra_entities: HashMap::new(),
123             ignore_end_of_stream: false,
124             replace_unknown_entity_references: false,
125             ignore_root_level_whitespace: true,
126         }
127     }
128 
129     /// Creates an XML reader with this configuration.
130     ///
131     /// This is a convenience method for configuring and creating a reader at the same time:
132     ///
133     /// ```rust
134     /// use xml::reader::ParserConfig;
135     ///
136     /// let mut source: &[u8] = b"...";
137     ///
138     /// let reader = ParserConfig::new()
139     ///     .trim_whitespace(true)
140     ///     .ignore_comments(true)
141     ///     .coalesce_characters(false)
142     ///     .create_reader(&mut source);
143     /// ```
144     ///
145     /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
146     /// this configuration object.
147     #[inline]
create_reader<R: Read>(self, source: R) -> EventReader<R>148     pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
149         EventReader::new_with_config(source, self)
150     }
151 
152     /// Adds a new entity mapping and returns an updated config object.
153     ///
154     /// This is a convenience method for adding external entities mappings to the XML parser.
155     /// An example:
156     ///
157     /// ```rust
158     /// use xml::reader::ParserConfig;
159     ///
160     /// let mut source: &[u8] = b"...";
161     ///
162     /// let reader = ParserConfig::new()
163     ///     .add_entity("nbsp", " ")
164     ///     .add_entity("copy", "©")
165     ///     .add_entity("reg", "®")
166     ///     .create_reader(&mut source);
167     /// ```
add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> ParserConfig168     pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> ParserConfig {
169         self.extra_entities.insert(entity.into(), value.into());
170         self
171     }
172 }
173 
174 impl Default for ParserConfig {
175     #[inline]
default() -> ParserConfig176     fn default() -> ParserConfig {
177         ParserConfig::new()
178     }
179 }
180 
181 gen_setters! { ParserConfig,
182     trim_whitespace: val bool,
183     whitespace_to_characters: val bool,
184     cdata_to_characters: val bool,
185     ignore_comments: val bool,
186     coalesce_characters: val bool,
187     ignore_end_of_stream: val bool,
188     replace_unknown_entity_references: val bool,
189     ignore_root_level_whitespace: val bool
190 }
191 
192 /// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct
193 #[derive(Clone, PartialEq, Eq, Debug)]
194 #[non_exhaustive]
195 pub struct ParserConfig2 {
196     pub(crate) c: ParserConfig,
197 
198     /// Use this encoding as the default. Necessary for UTF-16 files without BOM.
199     pub override_encoding: Option<Encoding>,
200 
201     /// Allow `<?xml encoding="…">` to contain unsupported encoding names,
202     /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
203     pub ignore_invalid_encoding_declarations: bool,
204 
205     /// Documents with multiple root elements are ill-formed
206     pub allow_multiple_root_elements: bool,
207 
208     /// Abort if custom entities create a string longer than this
209     pub max_entity_expansion_length: usize,
210     /// Entities can expand into other entities this many times (be careful about exponential cost!)
211     pub max_entity_expansion_depth: u8,
212 
213     /// Maximum length of tag name or attribute name
214     pub max_name_length: usize,
215 
216     /// Max number of attributes per element
217     pub max_attributes: usize,
218 
219     /// Max number of bytes in each attribute
220     pub max_attribute_length: usize,
221 
222     /// Maximum length of strings reprsenting characters, comments, and processing instructions
223     pub max_data_length: usize,
224 }
225 
226 impl Default for ParserConfig2 {
default() -> Self227     fn default() -> Self {
228         ParserConfig2 {
229             c: Default::default(),
230             override_encoding: None,
231             ignore_invalid_encoding_declarations: false,
232             allow_multiple_root_elements: true,
233             max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
234             max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,
235             max_attributes: 1<<16,
236             max_attribute_length: 1<<30,
237             max_data_length: 1<<30,
238             max_name_length: 1<<18,
239         }
240     }
241 }
242 
243 impl ParserConfig2 {
244     #[inline]
245     #[must_use]
new() -> Self246     pub fn new() -> Self {
247         Self::default()
248     }
249 
250     /// Read character encoding from `Content-Type` header.
251     /// Set this when parsing XML documents fetched over HTTP.
252     ///
253     /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback.
content_type(mut self, mime_type: &str) -> Self254     #[must_use] pub fn content_type(mut self, mime_type: &str) -> Self {
255         let charset = mime_type.split_once(';')
256             .and_then(|(_, args)| args.split_once("charset"))
257             .and_then(|(_, args)| args.split_once('='));
258         if let Some((_, charset)) = charset {
259             let name = charset.trim().trim_matches('"');
260             match name.parse() {
261                 Ok(enc) => {
262                     self.override_encoding = Some(enc);
263                 },
264                 Err(_) => {},
265             }
266         }
267         self
268     }
269 
270     /// Creates an XML reader with this configuration.
271     ///
272     /// This is a convenience method for configuring and creating a reader at the same time:
273     ///
274     /// ```rust
275     /// use xml::reader::ParserConfig;
276     ///
277     /// let mut source: &[u8] = b"...";
278     ///
279     /// let reader = ParserConfig::new()
280     ///     .trim_whitespace(true)
281     ///     .ignore_comments(true)
282     ///     .coalesce_characters(false)
283     ///     .create_reader(&mut source);
284     /// ```
285     ///
286     /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
287     /// this configuration object.
288     #[inline]
create_reader<R: Read>(self, source: R) -> EventReader<R>289     pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
290         EventReader::new_with_config(source, self)
291     }
292 }
293 
294 impl From<ParserConfig> for ParserConfig2 {
295     #[inline]
from(c: ParserConfig) -> Self296     fn from(c: ParserConfig) -> Self {
297         Self {
298             c,
299             ..Default::default()
300         }
301     }
302 }
303 
304 gen_setters! { ParserConfig2,
305     /// Set if you got one in the HTTP header
306     override_encoding: val Option<Encoding>,
307     /// Allows invalid documents. There should be only a single root element in XML.
308     allow_multiple_root_elements: val bool,
309     /// Abort if custom entities create a string longer than this
310     max_entity_expansion_length: val usize,
311     /// Entities can expand into other entities this many times (be careful about exponential cost!)
312     max_entity_expansion_depth: val u8,
313     /// Max number of attributes per element
314     max_attributes: val usize,
315     /// Maximum length of tag name or attribute name
316     max_name_length: val usize,
317     /// Max number of bytes in each attribute
318     max_attribute_length: val usize,
319     /// Maximum length of strings reprsenting characters, comments, and processing instructions
320     max_data_length: val usize,
321     /// Allow `<?xml encoding="bogus"?>`
322     ignore_invalid_encoding_declarations: val bool
323 }
324 
325 gen_setters! { ParserConfig,
326     /// Set if you got one in the HTTP header (see `content_type`)
327     override_encoding: c2 Option<Encoding>,
328     /// Allow `<?xml encoding="bogus"?>`
329     ignore_invalid_encoding_declarations: c2 bool,
330     /// Allows invalid documents. There should be only a single root element in XML.
331     allow_multiple_root_elements: c2 bool,
332 
333     /// Abort if custom entities create a string longer than this
334     max_entity_expansion_length: c2 usize,
335     /// Entities can expand into other entities this many times (be careful about exponential cost!)
336     max_entity_expansion_depth: c2 u8,
337     /// Max number of attributes per element
338     max_attributes: c2 usize,
339     /// Maximum length of tag name or attribute name
340     max_name_length: c2 usize,
341     /// Max number of bytes in each attribute
342     max_attribute_length: c2 usize,
343     /// Maximum length of strings reprsenting characters, comments, and processing instructions
344     max_data_length: c2 usize,
345 
346     /// Set encoding from the MIME type. Important for HTTP compatibility.
347     content_type: c2 &str
348 }
349 
350 gen_setters! { ParserConfig2,
351     trim_whitespace: delegate bool,
352     whitespace_to_characters: delegate bool,
353     cdata_to_characters: delegate bool,
354     ignore_comments: delegate bool,
355     coalesce_characters: delegate bool,
356     ignore_end_of_stream: delegate bool,
357     replace_unknown_entity_references: delegate bool,
358     /// Whether or not whitespace at the root level of the document is ignored. Default is true.
359     ignore_root_level_whitespace: delegate bool
360 }
361 
#[test]
fn mime_parse() {
    // Oddly-cased charset name; a setter applied afterwards must not disturb it.
    let ascii = ParserConfig2::new()
        .content_type("text/xml;charset=Us-AScii")
        .max_entity_expansion_length(1000);
    assert_eq!(ascii.override_encoding, Some(Encoding::Ascii));

    // Quoted charset value with spaces around `=`, set after another option.
    let utf16 = ParserConfig2::new()
        .max_entity_expansion_depth(3)
        .content_type("text/xml;charset = \"UTF-16\"");
    assert_eq!(utf16.override_encoding, Some(Encoding::Utf16));
}
370