xref: /aosp_15_r20/external/cronet/third_party/rust/chromium_crates_io/vendor/fend-core-1.4.6/src/lexer.rs (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 use crate::date::Date;
2 use crate::error::{FendError, Interrupt};
3 use crate::ident::Ident;
4 use crate::num::{Base, Number};
5 use crate::result::FResult;
6 use std::{borrow, convert, fmt};
7 
8 #[derive(Clone, Debug)]
9 pub(crate) enum Token {
10 	Num(Number),
11 	Ident(Ident),
12 	Symbol(Symbol),
13 	StringLiteral(borrow::Cow<'static, str>),
14 	Date(Date),
15 }
16 
/// Operators, punctuation and keyword operators recognised by the lexer.
///
/// The `Display` implementation below gives the textual spelling used when a
/// symbol is printed.
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub(crate) enum Symbol {
	OpenParens,
	CloseParens,
	Add,
	Sub,
	Mul,
	Div,
	Mod,
	Pow,
	BitwiseAnd,
	BitwiseOr,
	BitwiseXor,
	UnitConversion,
	Factorial,
	Fn,
	Backslash,
	Dot,
	Of,
	ShiftLeft,
	ShiftRight,
	Semicolon,
	Equals,       // used for assignment
	DoubleEquals, // used for equality
	NotEquals,
	Combination,
	Permutation,
}

impl fmt::Display for Symbol {
	/// Writes the textual spelling of the symbol to `f`.
	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
		let s = match self {
			Self::OpenParens => "(",
			Self::CloseParens => ")",
			Self::Add => "+",
			Self::Sub => "-",
			Self::Mul => "*",
			Self::Div => "/",
			Self::Mod => "mod",
			Self::Pow => "^",
			Self::BitwiseAnd => "&",
			Self::BitwiseOr => "|",
			Self::BitwiseXor => " xor ",
			Self::UnitConversion => "to",
			Self::Factorial => "!",
			Self::Fn => ":",
			// A single backslash character (previously rendered as a double quote).
			Self::Backslash => "\\",
			Self::Dot => ".",
			Self::Of => "of",
			Self::ShiftLeft => "<<",
			Self::ShiftRight => ">>",
			Self::Semicolon => ";",
			Self::Equals => "=",
			Self::DoubleEquals => "==",
			Self::NotEquals => "!=",
			Self::Combination => "nCr",
			Self::Permutation => "nPr",
		};
		write!(f, "{s}")?;
		Ok(())
	}
}
79 
parse_char(input: &str) -> FResult<(char, &str)>80 fn parse_char(input: &str) -> FResult<(char, &str)> {
81 	input
82 		.chars()
83 		.next()
84 		.map_or(Err(FendError::ExpectedACharacter), |ch| {
85 			let (_, b) = input.split_at(ch.len_utf8());
86 			Ok((ch, b))
87 		})
88 }
89 
parse_ascii_digit(input: &str, base: Base) -> FResult<(u8, &str)>90 fn parse_ascii_digit(input: &str, base: Base) -> FResult<(u8, &str)> {
91 	let (ch, input) = parse_char(input)?;
92 	let possible_digit = ch.to_digit(base.base_as_u8().into());
93 	possible_digit
94 		.and_then(|d| <u32 as convert::TryInto<u8>>::try_into(d).ok())
95 		.map_or(Err(FendError::ExpectedADigit(ch)), |digit| {
96 			Ok((digit, input))
97 		})
98 }
99 
parse_fixed_char(input: &str, ch: char) -> FResult<((), &str)>100 fn parse_fixed_char(input: &str, ch: char) -> FResult<((), &str)> {
101 	let (parsed_ch, input) = parse_char(input)?;
102 	if parsed_ch == ch {
103 		Ok(((), input))
104 	} else {
105 		Err(FendError::ExpectedChar(ch, parsed_ch))
106 	}
107 }
108 
parse_digit_separator(input: &str) -> FResult<((), &str)>109 fn parse_digit_separator(input: &str) -> FResult<((), &str)> {
110 	let (parsed_ch, input) = parse_char(input)?;
111 	if parsed_ch == '_' || parsed_ch == ',' {
112 		Ok(((), input))
113 	} else {
114 		Err(FendError::ExpectedDigitSeparator(parsed_ch))
115 	}
116 }
117 
118 // Parses a plain integer with no whitespace and no base prefix.
119 // Leading minus sign is not allowed.
parse_integer<'a, E: From<FendError>>( input: &'a str, allow_digit_separator: bool, base: Base, process_digit: &mut impl FnMut(u8) -> Result<(), E>, ) -> Result<((), &'a str), E>120 fn parse_integer<'a, E: From<FendError>>(
121 	input: &'a str,
122 	allow_digit_separator: bool,
123 	base: Base,
124 	process_digit: &mut impl FnMut(u8) -> Result<(), E>,
125 ) -> Result<((), &'a str), E> {
126 	let (digit, mut input) = parse_ascii_digit(input, base)?;
127 	process_digit(digit)?;
128 	let mut parsed_digit_separator;
129 	loop {
130 		if let Ok(((), remaining)) = parse_digit_separator(input) {
131 			input = remaining;
132 			parsed_digit_separator = true;
133 			if !allow_digit_separator {
134 				return Err(FendError::DigitSeparatorsNotAllowed.into());
135 			}
136 		} else {
137 			parsed_digit_separator = false;
138 		}
139 		match parse_ascii_digit(input, base) {
140 			Err(_) => {
141 				if parsed_digit_separator {
142 					return Err(FendError::DigitSeparatorsOnlyBetweenDigits.into());
143 				}
144 				break;
145 			}
146 			Ok((digit, next_input)) => {
147 				process_digit(digit)?;
148 				input = next_input;
149 			}
150 		}
151 	}
152 	Ok(((), input))
153 }
154 
parse_base_prefix(input: &str) -> FResult<(Base, &str)>155 fn parse_base_prefix(input: &str) -> FResult<(Base, &str)> {
156 	// 0x -> 16
157 	// 0o -> 8
158 	// 0b -> 2
159 	// base# -> base (where 2 <= base <= 36)
160 	// case-sensitive, no whitespace allowed
161 	if let Ok(((), input)) = parse_fixed_char(input, '0') {
162 		let (ch, input) = parse_char(input)?;
163 		Ok((Base::from_zero_based_prefix_char(ch)?, input))
164 	} else {
165 		let mut custom_base: u8 = 0;
166 		let ((), input) = parse_integer(input, false, Base::default(), &mut |digit| -> Result<
167 			(),
168 			FendError,
169 		> {
170 			let error = FendError::BaseTooLarge;
171 			if custom_base > 3 {
172 				return Err(error);
173 			}
174 			custom_base = 10 * custom_base + digit;
175 			if custom_base > 36 {
176 				return Err(error);
177 			}
178 			Ok(())
179 		})?;
180 		if custom_base < 2 {
181 			return Err(FendError::BaseTooSmall);
182 		}
183 		let ((), input) = parse_fixed_char(input, '#')?;
184 		Ok((Base::from_custom_base(custom_base)?, input))
185 	}
186 }
187 
188 // Try and parse recurring digits in parentheses.
189 // '1.0(0)' -> success
190 // '1.0(a)', '1.0( 0)' -> Ok, but not parsed
191 // '1.0(3a)' -> FendError
192 
parse_recurring_digits<'a, I: Interrupt>( input: &'a str, number: &mut Number, num_nonrec_digits: usize, base: Base, int: &I, ) -> FResult<((), &'a str)>193 fn parse_recurring_digits<'a, I: Interrupt>(
194 	input: &'a str,
195 	number: &mut Number,
196 	num_nonrec_digits: usize,
197 	base: Base,
198 	int: &I,
199 ) -> FResult<((), &'a str)> {
200 	let original_input = input;
201 	// If there's no '(': return Ok but don't parse anything
202 	if parse_fixed_char(input, '(').is_err() {
203 		return Ok(((), original_input));
204 	}
205 	let ((), input) = parse_fixed_char(input, '(')?;
206 	if parse_ascii_digit(input, base).is_err() {
207 		// return Ok if there were no digits
208 		return Ok(((), original_input));
209 	}
210 	let mut recurring_number_num = Number::from(0);
211 	let mut recurring_number_den = Number::from(1);
212 	let base_as_u64 = u64::from(base.base_as_u8());
213 	let ((), input) = parse_integer(input, true, base, &mut |digit| -> FResult<()> {
214 		let digit_as_u64 = u64::from(digit);
215 		recurring_number_num = recurring_number_num
216 			.clone()
217 			.mul(base_as_u64.into(), int)?
218 			.add(digit_as_u64.into(), int)?;
219 		recurring_number_den = recurring_number_den.clone().mul(base_as_u64.into(), int)?;
220 		Ok(())
221 	})?;
222 	recurring_number_den = recurring_number_den.clone().sub(1.into(), int)?;
223 	for _ in 0..num_nonrec_digits {
224 		recurring_number_den = recurring_number_den.clone().mul(base_as_u64.into(), int)?;
225 	}
226 	*number = number
227 		.clone()
228 		.add(recurring_number_num.div(recurring_number_den, int)?, int)?;
229 	// return an error if there are any other characters before the closing parentheses
230 	let ((), input) = parse_fixed_char(input, ')')?;
231 	Ok(((), input))
232 }
233 
234 #[allow(clippy::too_many_lines)]
parse_basic_number<'a, I: Interrupt>( mut input: &'a str, base: Base, int: &I, ) -> FResult<(Number, &'a str)>235 fn parse_basic_number<'a, I: Interrupt>(
236 	mut input: &'a str,
237 	base: Base,
238 	int: &I,
239 ) -> FResult<(Number, &'a str)> {
240 	let mut is_dice_with_no_count = false;
241 	if input.starts_with('d') && base.base_as_u8() <= 10 {
242 		let mut chars = input.chars();
243 		chars.next();
244 		let following = chars.next();
245 		if following.is_some() && following.unwrap().is_ascii_digit() {
246 			is_dice_with_no_count = true;
247 		}
248 	}
249 
250 	// parse integer component
251 	let mut res = Number::zero_with_base(base);
252 	let base_as_u64 = u64::from(base.base_as_u8());
253 	let mut is_integer = true;
254 
255 	if parse_fixed_char(input, '.').is_err() && !is_dice_with_no_count {
256 		let ((), remaining) = parse_integer(input, true, base, &mut |digit| -> FResult<()> {
257 			res = res
258 				.clone()
259 				.mul(base_as_u64.into(), int)?
260 				.add(u64::from(digit).into(), int)?;
261 			Ok(())
262 		})?;
263 		input = remaining;
264 	}
265 
266 	// parse decimal point and at least one digit
267 	if let Ok(((), remaining)) = parse_fixed_char(input, '.') {
268 		is_integer = false;
269 		let mut num_nonrec_digits = 0;
270 		let mut numerator = Number::zero_with_base(base);
271 		let mut denominator = Number::zero_with_base(base).add(1.into(), int)?;
272 		if parse_fixed_char(remaining, '(').is_err() {
273 			let ((), remaining) = parse_integer(remaining, true, base, &mut |digit| -> Result<
274 				(),
275 				FendError,
276 			> {
277 				numerator = numerator
278 					.clone()
279 					.mul(base_as_u64.into(), int)?
280 					.add(u64::from(digit).into(), int)?;
281 				denominator = denominator.clone().mul(base_as_u64.into(), int)?;
282 				num_nonrec_digits += 1;
283 				Ok(())
284 			})?;
285 			input = remaining;
286 		} else {
287 			input = remaining;
288 		}
289 		res = res.add(numerator.div(denominator, int)?, int)?;
290 
291 		// try parsing recurring decimals
292 		let ((), remaining) =
293 			parse_recurring_digits(input, &mut res, num_nonrec_digits, base, int)?;
294 		input = remaining;
295 	}
296 
297 	// parse dice syntax
298 	if is_integer && base.base_as_u8() <= 10 {
299 		if let Ok(((), remaining)) = parse_fixed_char(input, 'd') {
300 			// peek to see if there's a digit immediately after the `d`:
301 			if parse_ascii_digit(remaining, base).is_ok() {
302 				let dice_count: u32 = if is_dice_with_no_count {
303 					1
304 				} else {
305 					convert::TryFrom::try_from(res.try_as_usize(int)?)
306 						.map_err(|_| FendError::InvalidDiceSyntax)?
307 				};
308 				let mut face_count = 0_u32;
309 				let ((), remaining2) =
310 					parse_integer(remaining, false, base, &mut |digit| -> FResult<()> {
311 						face_count = face_count
312 							.checked_mul(base.base_as_u8().into())
313 							.ok_or(FendError::InvalidDiceSyntax)?
314 							.checked_add(digit.into())
315 							.ok_or(FendError::InvalidDiceSyntax)?;
316 						Ok(())
317 					})?;
318 				if dice_count == 0 || face_count == 0 {
319 					return Err(FendError::InvalidDiceSyntax);
320 				}
321 				res = Number::new_die(dice_count, face_count, int)?;
322 				res = res.with_base(base);
323 				return Ok((res, remaining2));
324 			}
325 		}
326 	}
327 
328 	// parse optional exponent, but only for base 10 and below
329 	if base.base_as_u8() <= 10 {
330 		let (parsed_exponent, remaining) = if let Ok(((), remaining)) = parse_fixed_char(input, 'e')
331 		{
332 			(true, remaining)
333 		} else if let Ok(((), remaining)) = parse_fixed_char(input, 'E') {
334 			(true, remaining)
335 		} else {
336 			(false, "")
337 		};
338 
339 		if parsed_exponent {
340 			// peek ahead to the next char to determine if we should continue parsing an exponent
341 			let abort = if let Ok((ch, _)) = parse_char(remaining) {
342 				// abort if there is a non-digit non-plus or minus char after 'e',
343 				// such as '(', '/' or 'a'. Note that this is only parsed in base <= 10,
344 				// so letters can never be digits. We do want to include all digits even for
345 				// base < 10 though to avoid 6#3e9 from being valid.
346 				!(ch.is_ascii_digit() || ch == '+' || ch == '-')
347 			} else {
348 				// if there is no more input after the 'e', abort
349 				true
350 			};
351 			if !abort {
352 				input = remaining;
353 				let mut negative_exponent = false;
354 				if let Ok(((), remaining)) = parse_fixed_char(input, '-') {
355 					negative_exponent = true;
356 					input = remaining;
357 				} else if let Ok(((), remaining)) = parse_fixed_char(input, '+') {
358 					input = remaining;
359 				}
360 				let mut exp = Number::zero_with_base(base);
361 				let base_num = Number::from(u64::from(base.base_as_u8()));
362 				let ((), remaining2) =
363 					parse_integer(input, true, base, &mut |digit| -> FResult<()> {
364 						exp = (exp.clone().mul(base_num.clone(), int)?)
365 							.add(u64::from(digit).into(), int)?;
366 						Ok(())
367 					})?;
368 				if negative_exponent {
369 					exp = -exp;
370 				}
371 				let base_as_number: Number = base_as_u64.into();
372 				res = res.mul(base_as_number.pow(exp, int)?, int)?;
373 				input = remaining2;
374 			}
375 		}
376 	}
377 
378 	// parse exponentiation via unicode superscript digits
379 	if base.base_as_u8() <= 10
380 		&& input
381 			.chars()
382 			.next()
383 			.is_some_and(|c| SUPERSCRIPT_DIGITS.contains(&c))
384 	{
385 		if let Ok((mut power_digits, remaining)) = parse_power_number(input) {
386 			let mut exponent = Number::zero_with_base(base);
387 
388 			power_digits.reverse();
389 
390 			for (i, digit) in power_digits.into_iter().enumerate() {
391 				let num = digit * 10u64.pow(u32::try_from(i).unwrap());
392 				exponent = exponent.add(num.into(), int)?;
393 			}
394 
395 			res = res.pow(exponent, int)?;
396 			input = remaining;
397 		}
398 	}
399 
400 	Ok((res, input))
401 }
402 
403 const SUPERSCRIPT_DIGITS: [char; 10] = ['⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹'];
404 
parse_power_number(input: &str) -> FResult<(Vec<u64>, &str)>405 fn parse_power_number(input: &str) -> FResult<(Vec<u64>, &str)> {
406 	let mut digits: Vec<u64> = Vec::new();
407 
408 	let (mut ch, mut input) = parse_char(input)?;
409 	while let Some((idx, _)) = SUPERSCRIPT_DIGITS
410 		.iter()
411 		.enumerate()
412 		.find(|(_, x)| **x == ch)
413 	{
414 		digits.push(idx as u64);
415 		if input.is_empty() {
416 			break;
417 		}
418 		(ch, input) = parse_char(input)?;
419 	}
420 
421 	Ok((digits, input))
422 }
423 
parse_number<'a, I: Interrupt>(input: &'a str, int: &I) -> FResult<(Number, &'a str)>424 fn parse_number<'a, I: Interrupt>(input: &'a str, int: &I) -> FResult<(Number, &'a str)> {
425 	let (base, input) = parse_base_prefix(input).unwrap_or((Base::default(), input));
426 	let (res, input) = parse_basic_number(input, base, int)?;
427 	Ok((res, input))
428 }
429 
/// Decides whether `ch` may appear in an identifier.
///
/// `prev` is the character preceding `ch` within the same identifier, or
/// `None` when `ch` would be the identifier's first character.
fn is_valid_in_ident(ch: char, prev: Option<char>) -> bool {
	// non-alphabetic characters that are accepted anywhere in an identifier
	const EXTRA_IDENT_CHARS: &[char] = &[
		',', '_', '⅛', '¼', '⅜', '½', '⅝', '¾', '⅞', '⅙', '⅓', '⅔', '⅚', '⅕', '⅖', '⅗', '⅘', '°',
		'$', '℃', '℉', '℧', '℈', '℥', '℔', '¢', '£', '¥', '€', '₩', '₪', '₤', '₨', '฿', '₡', '₣',
		'₦', '₧', '₫', '₭', '₮', '₯', '₱', '﷼', '﹩', '￠', '￡', '￥', '￦', '㍱', '㍲', '㍳',
		'㍴', '㍶', '㎀', '㎁', '㎂', '㎃', '㎄', '㎅', '㎆', '㎇', '㎈', '㎉', '㎊', '㎋', '㎌',
		'㎍', '㎎', '㎏', '㎐', '㎑', '㎒', '㎓', '㎔', '㎕', '㎖', '㎗', '㎘', '㎙', '㎚', '㎛',
		'㎜', '㎝', '㎞', '㎟', '㎠', '㎡', '㎢', '㎣', '㎤', '㎥', '㎦', '㎧', '㎨', '㎩', '㎪',
		'㎫', '㎬', '㎭', '㎮', '㎯', '㎰', '㎱', '㎲', '㎳', '㎴', '㎵', '㎶', '㎷', '㎸', '㎹',
		'㎺', '㎻', '㎼', '㎽', '㎾', '㎿', '㏀', '㏁', '㏃', '㏄', '㏅', '㏆', '㏈', '㏉', '㏊',
		'㏌', '㏏', '㏐', '㏓', '㏔', '㏕', '㏖', '㏗', '㏙', '㏛', '㏜', '㏝',
	];
	// characters that form a complete identifier on their own
	const STANDALONE_ONLY: &[char] = &['%', '‰', '‱', '′', '″', '’', '”', 'π'];
	// a digit (or '.', quote) directly after one of these does not extend the identifier
	const SPLIT_BEFORE_DIGIT: &[char] = &['$', '£', '¥'];
	const NEVER_VALID: &[char] = &['λ'];
	// only valid when they follow an earlier character of the identifier
	const CONTINUATION_ONLY: &str = ".0123456789'\"";

	if NEVER_VALID.contains(&ch) {
		return false;
	}
	if STANDALONE_ONLY.contains(&ch) {
		// these are only valid if there was no previous char
		return prev.is_none();
	}
	if prev.is_some_and(|p| STANDALONE_ONLY.contains(&p)) {
		// nothing may follow a character that is only valid by itself
		return false;
	}
	if ch.is_alphabetic() || EXTRA_IDENT_CHARS.contains(&ch) {
		return true;
	}
	match prev {
		Some(p) => !SPLIT_BEFORE_DIGIT.contains(&p) && CONTINUATION_ONLY.contains(ch),
		None => false,
	}
}
463 
parse_ident(input: &str, allow_dots: bool) -> FResult<(Token, &str)>464 fn parse_ident(input: &str, allow_dots: bool) -> FResult<(Token, &str)> {
465 	let (first_char, _) = parse_char(input)?;
466 	if !is_valid_in_ident(first_char, None) || first_char == '.' && !allow_dots {
467 		return Err(FendError::InvalidCharAtBeginningOfIdent(first_char));
468 	}
469 	let mut byte_idx = first_char.len_utf8();
470 	let (_, mut remaining) = input.split_at(byte_idx);
471 	let mut prev_char = first_char;
472 	while let Ok((next_char, remaining_input)) = parse_char(remaining) {
473 		if !is_valid_in_ident(next_char, Some(prev_char)) || next_char == '.' && !allow_dots {
474 			break;
475 		}
476 		remaining = remaining_input;
477 		byte_idx += next_char.len_utf8();
478 		prev_char = next_char;
479 	}
480 	let (ident, input) = input.split_at(byte_idx);
481 	Ok((
482 		match ident {
483 			"to" | "as" | "in" => Token::Symbol(Symbol::UnitConversion),
484 			"per" => Token::Symbol(Symbol::Div),
485 			"of" => Token::Symbol(Symbol::Of),
486 			"mod" => Token::Symbol(Symbol::Mod),
487 			"xor" | "XOR" => Token::Symbol(Symbol::BitwiseXor),
488 			"and" | "AND" => Token::Symbol(Symbol::BitwiseAnd),
489 			"or" | "OR" => Token::Symbol(Symbol::BitwiseOr),
490 			"nCr" | "choose" => Token::Symbol(Symbol::Combination),
491 			"nPr" | "permute" => Token::Symbol(Symbol::Permutation),
492 			_ => Token::Ident(Ident::new_string(ident.to_string())),
493 		},
494 		input,
495 	))
496 }
497 
parse_symbol(ch: char, input: &mut &str) -> FResult<Token>498 fn parse_symbol(ch: char, input: &mut &str) -> FResult<Token> {
499 	let mut test_next = |next: char| {
500 		if input.starts_with(next) {
501 			let (_, remaining) = input.split_at(next.len_utf8());
502 			*input = remaining;
503 			true
504 		} else {
505 			false
506 		}
507 	};
508 	Ok(Token::Symbol(match ch {
509 		'(' => Symbol::OpenParens,
510 		')' => Symbol::CloseParens,
511 		'+' => Symbol::Add,
512 		'!' => {
513 			if test_next('=') {
514 				Symbol::NotEquals
515 			} else {
516 				Symbol::Factorial
517 			}
518 		}
519 		// unicode minus sign
520 		'-' | '\u{2212}' => Symbol::Sub,
521 		'*' | '\u{d7}' | '\u{2715}' => {
522 			if test_next('*') {
523 				Symbol::Pow
524 			} else {
525 				Symbol::Mul
526 			}
527 		}
528 		'/' | '\u{f7}' | '\u{2215}' => Symbol::Div, // unicode division symbol and slash
529 		'^' => Symbol::Pow,
530 		'&' => Symbol::BitwiseAnd,
531 		'|' => Symbol::BitwiseOr,
532 		':' => Symbol::Fn,
533 		'=' => {
534 			if test_next('>') {
535 				Symbol::Fn
536 			} else if test_next('=') {
537 				Symbol::DoubleEquals
538 			} else {
539 				Symbol::Equals
540 			}
541 		}
542 		'\u{2260}' => Symbol::NotEquals,       // unicode not equal to symbol
543 		'\\' | '\u{3bb}' => Symbol::Backslash, // lambda symbol
544 		'.' => Symbol::Dot,
545 		'<' => {
546 			if test_next('<') {
547 				Symbol::ShiftLeft
548 			} else if test_next('>') {
549 				Symbol::NotEquals
550 			} else {
551 				return Err(FendError::UnexpectedChar(ch));
552 			}
553 		}
554 		'>' => {
555 			if test_next('>') {
556 				Symbol::ShiftRight
557 			} else {
558 				return Err(FendError::UnexpectedChar(ch));
559 			}
560 		}
561 		';' => Symbol::Semicolon,
562 		_ => return Err(FendError::UnexpectedChar(ch)),
563 	}))
564 }
565 
parse_unicode_escape(chars_iter: &mut std::str::CharIndices<'_>) -> FResult<char>566 fn parse_unicode_escape(chars_iter: &mut std::str::CharIndices<'_>) -> FResult<char> {
567 	if chars_iter
568 		.next()
569 		.ok_or(FendError::UnterminatedStringLiteral)?
570 		.1 != '{'
571 	{
572 		return Err(FendError::InvalidUnicodeEscapeSequence);
573 	}
574 	let mut result_value = 0;
575 	let mut zero_length = true;
576 	loop {
577 		let (_, ch) = chars_iter
578 			.next()
579 			.ok_or(FendError::UnterminatedStringLiteral)?;
580 		if ch.is_ascii_hexdigit() {
581 			zero_length = false;
582 			result_value *= 16;
583 			result_value += ch
584 				.to_digit(16)
585 				.ok_or(FendError::InvalidUnicodeEscapeSequence)?;
586 			if result_value > 0x10_ffff {
587 				return Err(FendError::InvalidUnicodeEscapeSequence);
588 			}
589 		} else if ch == '}' {
590 			break;
591 		} else {
592 			return Err(FendError::InvalidUnicodeEscapeSequence);
593 		}
594 	}
595 	if zero_length {
596 		return Err(FendError::InvalidUnicodeEscapeSequence);
597 	}
598 	if let Ok(ch) = <char as convert::TryFrom<u32>>::try_from(result_value) {
599 		Ok(ch)
600 	} else {
601 		Err(FendError::InvalidUnicodeEscapeSequence)
602 	}
603 }
604 
parse_string_literal(input: &str, terminator: char) -> FResult<(Token, &str)>605 fn parse_string_literal(input: &str, terminator: char) -> FResult<(Token, &str)> {
606 	let (_, input) = input.split_at(1);
607 	let mut chars_iter = input.char_indices();
608 	let mut literal_length = None;
609 	let mut literal_string = String::new();
610 	let mut skip_whitespace = false;
611 	while let Some((idx, ch)) = chars_iter.next() {
612 		if skip_whitespace {
613 			if ch.is_ascii_whitespace() {
614 				continue;
615 			}
616 			skip_whitespace = false;
617 		}
618 		if ch == terminator {
619 			literal_length = Some(idx);
620 			break;
621 		}
622 		if ch == '\\' {
623 			let (_, next) = chars_iter
624 				.next()
625 				.ok_or(FendError::UnterminatedStringLiteral)?;
626 			let escaped_char = match next {
627 				'\\' => Some('\\'),
628 				'"' => Some('"'),
629 				'\'' => Some('\''),
630 				'a' => Some('\u{7}'),  // bell
631 				'b' => Some('\u{8}'),  // backspace
632 				'e' => Some('\u{1b}'), // escape
633 				'f' => Some('\u{c}'),  // form feed
634 				'n' => Some('\n'),     // line feed
635 				'r' => Some('\r'),     // carriage return
636 				't' => Some('\t'),     // tab
637 				'v' => Some('\u{0b}'), // vertical tab
638 				'x' => {
639 					// two-character hex code
640 					let (_, hex1) = chars_iter
641 						.next()
642 						.ok_or(FendError::UnterminatedStringLiteral)?;
643 					let (_, hex2) = chars_iter
644 						.next()
645 						.ok_or(FendError::UnterminatedStringLiteral)?;
646 					let hex1: u8 = convert::TryInto::try_into(
647 						hex1.to_digit(8).ok_or(FendError::BackslashXOutOfRange)?,
648 					)
649 					.unwrap();
650 					let hex2: u8 = convert::TryInto::try_into(
651 						hex2.to_digit(16).ok_or(FendError::BackslashXOutOfRange)?,
652 					)
653 					.unwrap();
654 					Some((hex1 * 16 + hex2) as char)
655 				}
656 				'u' => Some(parse_unicode_escape(&mut chars_iter)?),
657 				'z' => {
658 					skip_whitespace = true;
659 					None
660 				}
661 				'^' => {
662 					// control character escapes
663 					let (_, letter) = chars_iter
664 						.next()
665 						.ok_or(FendError::UnterminatedStringLiteral)?;
666 					let code = letter as u8;
667 					if !(63..=95).contains(&code) {
668 						return Err(FendError::ExpectedALetterOrCode);
669 					}
670 					Some(if code == b'?' {
671 						'\x7f'
672 					} else {
673 						(code - 64) as char
674 					})
675 				}
676 				_ => return Err(FendError::UnknownBackslashEscapeSequence(next)),
677 			};
678 			if let Some(escaped_char) = escaped_char {
679 				literal_string.push(escaped_char);
680 			}
681 		} else {
682 			literal_string.push(ch);
683 		}
684 	}
685 	let literal_length = literal_length.ok_or(FendError::UnterminatedStringLiteral)?;
686 	let (_, remaining) = input.split_at(literal_length + 1);
687 	Ok((Token::StringLiteral(literal_string.into()), remaining))
688 }
689 
690 // parses a unit beginning with ' or "
parse_quote_unit(input: &str) -> (Token, &str)691 fn parse_quote_unit(input: &str) -> (Token, &str) {
692 	let mut split_idx = 1;
693 	if let Some(ch) = input.split_at(1).1.chars().next() {
694 		if ch.is_alphabetic() {
695 			split_idx += ch.len_utf8();
696 			let mut prev = ch;
697 			let (_, mut remaining) = input.split_at(split_idx);
698 			while let Some(next) = remaining.chars().next() {
699 				if !is_valid_in_ident(next, Some(prev)) {
700 					break;
701 				}
702 				split_idx += next.len_utf8();
703 				prev = next;
704 				let (_, remaining2) = input.split_at(split_idx);
705 				remaining = remaining2;
706 			}
707 		}
708 	}
709 	let (a, b) = input.split_at(split_idx);
710 	(Token::Ident(Ident::new_string(a.to_string())), b)
711 }
712 
713 pub(crate) struct Lexer<'a, 'b, I: Interrupt> {
714 	input: &'a str,
715 	// normally 0; 1 after backslash; 2 after ident after backslash
716 	after_backslash_state: u8,
717 	after_number_or_to: bool,
718 	int: &'b I,
719 }
720 
/// Advances `input` past leading whitespace and comments.
///
/// A comment starts with `# ` or `#!` and runs up to, but not including, the
/// next newline, or to the end of the input if there is no newline.
fn skip_whitespace_and_comments(input: &mut &str) {
	loop {
		let current: &str = *input;
		let trimmed = current.trim_start();
		let at_comment = trimmed.starts_with("# ") || trimmed.starts_with("#!");
		if !at_comment {
			*input = trimmed;
			return;
		}
		// drop the comment; the newline itself is removed as whitespace on the
		// next iteration
		*input = match trimmed.find('\n') {
			Some(newline_idx) => trimmed.split_at(newline_idx).1,
			None => "",
		};
	}
}
741 
parse_date(input: &str) -> FResult<(Date, &str)>742 fn parse_date(input: &str) -> FResult<(Date, &str)> {
743 	let (_, input) = input.split_at(1); // skip '@' symbol
744 	let mut input2 = input;
745 	let mut split_idx = 0;
746 	for i in 0..3 {
747 		let mut n = 0;
748 		while matches!(input2.chars().next(), Some('0'..='9')) {
749 			let (_, remaining) = input2.split_at(1);
750 			input2 = remaining;
751 			n += 1;
752 			split_idx += 1;
753 		}
754 		if n == 0 {
755 			return Err(FendError::ExpectedADateLiteral);
756 		}
757 		if i == 2 {
758 			break;
759 		}
760 		if !input2.starts_with('-') {
761 			return Err(FendError::ExpectedADateLiteral);
762 		}
763 		let (_, remaining) = input2.split_at(1);
764 		input2 = remaining;
765 		split_idx += 1;
766 	}
767 	let (date_str, result_remaining) = input.split_at(split_idx);
768 	let res = Date::parse(date_str)?;
769 	Ok((res, result_remaining))
770 }
771 
772 impl<'a, 'b, I: Interrupt> Lexer<'a, 'b, I> {
next_token(&mut self) -> FResult<Option<Token>>773 	fn next_token(&mut self) -> FResult<Option<Token>> {
774 		skip_whitespace_and_comments(&mut self.input);
775 		let (ch, following) = {
776 			let mut chars = self.input.chars();
777 			let ch = chars.next();
778 			let following = chars.next();
779 			(ch, following)
780 		};
781 		Ok(Some(match ch {
782 			Some(ch) => {
783 				if ch.is_ascii_digit()
784 					|| (ch == '.' && self.after_backslash_state == 0)
785 					|| (ch == 'd' && following.is_some() && following.unwrap().is_ascii_digit())
786 				{
787 					let (num, remaining) = parse_number(self.input, self.int)?;
788 					self.input = remaining;
789 					Token::Num(num)
790 				} else if ch == '\'' || ch == '"' {
791 					if self.after_number_or_to {
792 						let (token, remaining) = parse_quote_unit(self.input);
793 						self.input = remaining;
794 						token
795 					} else {
796 						// normal string literal, with possible escape sequences
797 						let (token, remaining) = parse_string_literal(self.input, ch)?;
798 						self.input = remaining;
799 						token
800 					}
801 				} else if ch == '@' {
802 					// date literal, e.g. @1970-01-01
803 					let (date, remaining) = parse_date(self.input)?;
804 					self.input = remaining;
805 					Token::Date(date)
806 				} else if self.input.starts_with("#\"") {
807 					// raw string literal
808 					let (_, remaining) = self.input.split_at(2);
809 					let literal_length = remaining
810 						.match_indices("\"#")
811 						.next()
812 						.ok_or(FendError::UnterminatedStringLiteral)?
813 						.0;
814 					let (literal, remaining) = remaining.split_at(literal_length);
815 					let (_terminator, remaining) = remaining.split_at(2);
816 					self.input = remaining;
817 					Token::StringLiteral(literal.to_string().into())
818 				} else if is_valid_in_ident(ch, None) {
819 					// dots aren't allowed in idents after a backslash
820 					let (ident, remaining) =
821 						parse_ident(self.input, self.after_backslash_state != 1)?;
822 					self.input = remaining;
823 					ident
824 				} else {
825 					let (_, remaining) = self.input.split_at(ch.len_utf8());
826 					self.input = remaining;
827 					parse_symbol(ch, &mut self.input)?
828 				}
829 			}
830 			None => return Ok(None),
831 		}))
832 	}
833 }
834 
835 impl<'a, I: Interrupt> Iterator for Lexer<'a, '_, I> {
836 	type Item = FResult<Token>;
837 
next(&mut self) -> Option<Self::Item>838 	fn next(&mut self) -> Option<Self::Item> {
839 		let res = match self.next_token() {
840 			Err(e) => Some(Err(e)),
841 			Ok(None) => None,
842 			Ok(Some(t)) => Some(Ok(t)),
843 		};
844 		self.after_number_or_to = matches!(
845 			res,
846 			Some(Ok(Token::Num(_) | Token::Symbol(Symbol::UnitConversion)))
847 		);
848 		if matches!(res, Some(Ok(Token::Symbol(Symbol::Backslash)))) {
849 			self.after_backslash_state = 1;
850 		} else if self.after_backslash_state == 1 {
851 			if let Some(Ok(Token::Ident(_))) = res {
852 				self.after_backslash_state = 2;
853 			} else {
854 				self.after_backslash_state = 0;
855 			}
856 		} else {
857 			self.after_backslash_state = 0;
858 		}
859 		res
860 	}
861 }
862 
lex<'a, 'b, I: Interrupt>(input: &'a str, int: &'b I) -> Lexer<'a, 'b, I>863 pub(crate) fn lex<'a, 'b, I: Interrupt>(input: &'a str, int: &'b I) -> Lexer<'a, 'b, I> {
864 	Lexer {
865 		input,
866 		after_backslash_state: 0,
867 		after_number_or_to: false,
868 		int,
869 	}
870 }
871