1 //! Contains parser configuration structure.
2 use std::collections::HashMap;
3 use std::io::Read;
4
5 use crate::reader::EventReader;
6 use crate::util::Encoding;
7
/// Default cap on the total length of text produced by entity expansion.
///
/// Together with the depth limit below, this defends against the
/// "billion laughs" attack.
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
/// Default cap on how deeply entities may expand into other entities
/// (the second half of the "billion laughs" defence).
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
11
/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
///
/// The options collected here change how the parser behaves and which
/// events it emits.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct ParserConfig {
    /// Whether whitespace in textual events should be removed. Default is false.
    ///
    /// When true, standalone whitespace is dropped entirely (no `Whitespace`
    /// events are emitted), and leading and trailing whitespace is stripped
    /// from `Characters` events. A `Characters` event that ends up empty
    /// after trimming is omitted from the output stream as well. That can
    /// only happen, however, if the `whitespace_to_characters` or
    /// `cdata_to_characters` options are set.
    ///
    /// CDATA events are not affected by this option unless
    /// `cdata_to_characters` is also set; in that case CDATA content is
    /// trimmed too.
    pub trim_whitespace: bool,

    /// Whether whitespace should be converted to characters.
    /// Default is false.
    ///
    /// If true, `Characters` events with the same content are emitted in
    /// place of `Whitespace` events. If `trim_whitespace` is also true, these
    /// events are trimmed to nothing and, consequently, not emitted.
    pub whitespace_to_characters: bool,

    /// Whether CDATA should be converted to characters.
    /// Default is false.
    ///
    /// If true, `Characters` events with the same content are emitted in
    /// place of `CData` events. If `trim_whitespace` is also true, these
    /// events are trimmed, and an event whose CDATA contained nothing but
    /// whitespace is omitted from the stream.
    pub cdata_to_characters: bool,

    /// Whether comments should be omitted. Default is true.
    ///
    /// If true, `Comment` events are not emitted at all.
    pub ignore_comments: bool,

    /// Whether sequential `Characters` events should be merged.
    /// Default is true.
    ///
    /// If true, multiple sequential `Characters` events are merged into
    /// a single event, that is, their data is concatenated.
    ///
    /// Multiple sequential `Characters` events are only possible if either
    /// `cdata_to_characters` or `ignore_comments` are set. Otherwise character
    /// events will always be separated by other events.
    pub coalesce_characters: bool,

    /// A map of extra entities recognized by the parser. Default is an empty map.
    ///
    /// By default the XML parser recognizes the entities defined in the XML spec.
    /// Sometimes, however, it is convenient to make the parser recognize
    /// additional entities which are also not available through the DTD
    /// definitions (especially given that at the moment DTD parsing is not
    /// supported).
    pub extra_entities: HashMap<String, String>,

    /// Whether the parser should ignore the end of stream. Default is false.
    ///
    /// By default the parser either errors out when it encounters a premature
    /// end of stream, or completes normally if the end of stream was expected.
    /// To keep reading from a stream whose input is supplied progressively,
    /// set this option to true. The parser will then allow `next()` to be
    /// invoked even after a supposed end of stream has happened.
    ///
    /// Note that support for this functionality is incomplete; for example,
    /// the parser will fail if the premature end of stream happens inside
    /// PCDATA. Therefore, use this option at your own risk.
    pub ignore_end_of_stream: bool,

    /// Whether non-unicode entity references get replaced with the
    /// replacement character. Default is false.
    ///
    /// When true, any decimal or hexadecimal character reference that cannot
    /// be converted from a u32 to a char using
    /// [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html)
    /// is converted into the unicode REPLACEMENT CHARACTER (U+FFFD).
    pub replace_unknown_entity_references: bool,

    /// Whether whitespace at the root level of the document is ignored.
    /// Default is true.
    ///
    /// By default any whitespace that is not enclosed within at least one
    /// level of elements is ignored. Setting this value to false causes root
    /// level whitespace events to be emitted.
    ///
    /// **There are more configuration options – see methods below**
    pub ignore_root_level_whitespace: bool,
}
99
100 impl ParserConfig {
101 /// Returns a new config with default values.
102 ///
103 /// You can tweak default values using builder-like pattern:
104 ///
105 /// ```rust
106 /// use xml::reader::ParserConfig;
107 ///
108 /// let config = ParserConfig::new()
109 /// .trim_whitespace(true)
110 /// .ignore_comments(true)
111 /// .coalesce_characters(false);
112 /// ```
113 #[must_use]
114 #[inline]
new() -> ParserConfig115 pub fn new() -> ParserConfig {
116 ParserConfig {
117 trim_whitespace: false,
118 whitespace_to_characters: false,
119 cdata_to_characters: false,
120 ignore_comments: true,
121 coalesce_characters: true,
122 extra_entities: HashMap::new(),
123 ignore_end_of_stream: false,
124 replace_unknown_entity_references: false,
125 ignore_root_level_whitespace: true,
126 }
127 }
128
129 /// Creates an XML reader with this configuration.
130 ///
131 /// This is a convenience method for configuring and creating a reader at the same time:
132 ///
133 /// ```rust
134 /// use xml::reader::ParserConfig;
135 ///
136 /// let mut source: &[u8] = b"...";
137 ///
138 /// let reader = ParserConfig::new()
139 /// .trim_whitespace(true)
140 /// .ignore_comments(true)
141 /// .coalesce_characters(false)
142 /// .create_reader(&mut source);
143 /// ```
144 ///
145 /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
146 /// this configuration object.
147 #[inline]
create_reader<R: Read>(self, source: R) -> EventReader<R>148 pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
149 EventReader::new_with_config(source, self)
150 }
151
152 /// Adds a new entity mapping and returns an updated config object.
153 ///
154 /// This is a convenience method for adding external entities mappings to the XML parser.
155 /// An example:
156 ///
157 /// ```rust
158 /// use xml::reader::ParserConfig;
159 ///
160 /// let mut source: &[u8] = b"...";
161 ///
162 /// let reader = ParserConfig::new()
163 /// .add_entity("nbsp", " ")
164 /// .add_entity("copy", "©")
165 /// .add_entity("reg", "®")
166 /// .create_reader(&mut source);
167 /// ```
add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> ParserConfig168 pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> ParserConfig {
169 self.extra_entities.insert(entity.into(), value.into());
170 self
171 }
172 }
173
174 impl Default for ParserConfig {
175 #[inline]
default() -> ParserConfig176 fn default() -> ParserConfig {
177 ParserConfig::new()
178 }
179 }
180
181 gen_setters! { ParserConfig,
182 trim_whitespace: val bool,
183 whitespace_to_characters: val bool,
184 cdata_to_characters: val bool,
185 ignore_comments: val bool,
186 coalesce_characters: val bool,
187 ignore_end_of_stream: val bool,
188 replace_unknown_entity_references: val bool,
189 ignore_root_level_whitespace: val bool
190 }
191
192 /// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct
193 #[derive(Clone, PartialEq, Eq, Debug)]
194 #[non_exhaustive]
195 pub struct ParserConfig2 {
196 pub(crate) c: ParserConfig,
197
198 /// Use this encoding as the default. Necessary for UTF-16 files without BOM.
199 pub override_encoding: Option<Encoding>,
200
201 /// Allow `<?xml encoding="…">` to contain unsupported encoding names,
202 /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
203 pub ignore_invalid_encoding_declarations: bool,
204
205 /// Documents with multiple root elements are ill-formed
206 pub allow_multiple_root_elements: bool,
207
208 /// Abort if custom entities create a string longer than this
209 pub max_entity_expansion_length: usize,
210 /// Entities can expand into other entities this many times (be careful about exponential cost!)
211 pub max_entity_expansion_depth: u8,
212
213 /// Maximum length of tag name or attribute name
214 pub max_name_length: usize,
215
216 /// Max number of attributes per element
217 pub max_attributes: usize,
218
219 /// Max number of bytes in each attribute
220 pub max_attribute_length: usize,
221
222 /// Maximum length of strings reprsenting characters, comments, and processing instructions
223 pub max_data_length: usize,
224 }
225
226 impl Default for ParserConfig2 {
default() -> Self227 fn default() -> Self {
228 ParserConfig2 {
229 c: Default::default(),
230 override_encoding: None,
231 ignore_invalid_encoding_declarations: false,
232 allow_multiple_root_elements: true,
233 max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
234 max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,
235 max_attributes: 1<<16,
236 max_attribute_length: 1<<30,
237 max_data_length: 1<<30,
238 max_name_length: 1<<18,
239 }
240 }
241 }
242
243 impl ParserConfig2 {
244 #[inline]
245 #[must_use]
new() -> Self246 pub fn new() -> Self {
247 Self::default()
248 }
249
250 /// Read character encoding from `Content-Type` header.
251 /// Set this when parsing XML documents fetched over HTTP.
252 ///
253 /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback.
content_type(mut self, mime_type: &str) -> Self254 #[must_use] pub fn content_type(mut self, mime_type: &str) -> Self {
255 let charset = mime_type.split_once(';')
256 .and_then(|(_, args)| args.split_once("charset"))
257 .and_then(|(_, args)| args.split_once('='));
258 if let Some((_, charset)) = charset {
259 let name = charset.trim().trim_matches('"');
260 match name.parse() {
261 Ok(enc) => {
262 self.override_encoding = Some(enc);
263 },
264 Err(_) => {},
265 }
266 }
267 self
268 }
269
270 /// Creates an XML reader with this configuration.
271 ///
272 /// This is a convenience method for configuring and creating a reader at the same time:
273 ///
274 /// ```rust
275 /// use xml::reader::ParserConfig;
276 ///
277 /// let mut source: &[u8] = b"...";
278 ///
279 /// let reader = ParserConfig::new()
280 /// .trim_whitespace(true)
281 /// .ignore_comments(true)
282 /// .coalesce_characters(false)
283 /// .create_reader(&mut source);
284 /// ```
285 ///
286 /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
287 /// this configuration object.
288 #[inline]
create_reader<R: Read>(self, source: R) -> EventReader<R>289 pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
290 EventReader::new_with_config(source, self)
291 }
292 }
293
294 impl From<ParserConfig> for ParserConfig2 {
295 #[inline]
from(c: ParserConfig) -> Self296 fn from(c: ParserConfig) -> Self {
297 Self {
298 c,
299 ..Default::default()
300 }
301 }
302 }
303
304 gen_setters! { ParserConfig2,
305 /// Set if you got one in the HTTP header
306 override_encoding: val Option<Encoding>,
307 /// Allows invalid documents. There should be only a single root element in XML.
308 allow_multiple_root_elements: val bool,
309 /// Abort if custom entities create a string longer than this
310 max_entity_expansion_length: val usize,
311 /// Entities can expand into other entities this many times (be careful about exponential cost!)
312 max_entity_expansion_depth: val u8,
313 /// Max number of attributes per element
314 max_attributes: val usize,
315 /// Maximum length of tag name or attribute name
316 max_name_length: val usize,
317 /// Max number of bytes in each attribute
318 max_attribute_length: val usize,
319 /// Maximum length of strings reprsenting characters, comments, and processing instructions
320 max_data_length: val usize,
321 /// Allow `<?xml encoding="bogus"?>`
322 ignore_invalid_encoding_declarations: val bool
323 }
324
325 gen_setters! { ParserConfig,
326 /// Set if you got one in the HTTP header (see `content_type`)
327 override_encoding: c2 Option<Encoding>,
328 /// Allow `<?xml encoding="bogus"?>`
329 ignore_invalid_encoding_declarations: c2 bool,
330 /// Allows invalid documents. There should be only a single root element in XML.
331 allow_multiple_root_elements: c2 bool,
332
333 /// Abort if custom entities create a string longer than this
334 max_entity_expansion_length: c2 usize,
335 /// Entities can expand into other entities this many times (be careful about exponential cost!)
336 max_entity_expansion_depth: c2 u8,
337 /// Max number of attributes per element
338 max_attributes: c2 usize,
339 /// Maximum length of tag name or attribute name
340 max_name_length: c2 usize,
341 /// Max number of bytes in each attribute
342 max_attribute_length: c2 usize,
343 /// Maximum length of strings reprsenting characters, comments, and processing instructions
344 max_data_length: c2 usize,
345
346 /// Set encoding from the MIME type. Important for HTTP compatibility.
347 content_type: c2 &str
348 }
349
350 gen_setters! { ParserConfig2,
351 trim_whitespace: delegate bool,
352 whitespace_to_characters: delegate bool,
353 cdata_to_characters: delegate bool,
354 ignore_comments: delegate bool,
355 coalesce_characters: delegate bool,
356 ignore_end_of_stream: delegate bool,
357 replace_unknown_entity_references: delegate bool,
358 /// Whether or not whitespace at the root level of the document is ignored. Default is true.
359 ignore_root_level_whitespace: delegate bool
360 }
361
/// `content_type` must pick up the charset regardless of letter case,
/// spacing around `=`, quoting, or where it sits in the builder chain.
#[test]
fn mime_parse() {
    let ascii_cfg = ParserConfig2::new()
        .content_type("text/xml;charset=Us-AScii")
        .max_entity_expansion_length(1000);
    assert_eq!(ascii_cfg.override_encoding, Some(Encoding::Ascii));

    let utf16_cfg = ParserConfig2::new()
        .max_entity_expansion_depth(3)
        .content_type("text/xml;charset = \"UTF-16\"");
    assert_eq!(utf16_cfg.override_encoding, Some(Encoding::Utf16));
}
370