1 use crate::date::Date;
2 use crate::error::{FendError, Interrupt};
3 use crate::ident::Ident;
4 use crate::num::{Base, Number};
5 use crate::result::FResult;
6 use std::{borrow, convert, fmt};
7
/// A single lexical token yielded by the lexer.
#[derive(Clone, Debug)]
pub(crate) enum Token {
    /// A numeric literal (also produced for dice syntax such as `d6`).
    Num(Number),
    /// An identifier, or a quote-introduced unit following a number.
    Ident(Ident),
    /// An operator, punctuation mark or keyword-like word (see `Symbol`).
    Symbol(Symbol),
    /// The decoded contents of a string literal (escapes already resolved)
    /// or the verbatim contents of a raw `#"..."#` literal.
    StringLiteral(borrow::Cow<'static, str>),
    /// A date literal introduced by `@`, e.g. `@1970-01-01`.
    Date(Date),
}
16
/// Operators, punctuation and keyword-like words recognised by the lexer.
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub(crate) enum Symbol {
    /// `(`
    OpenParens,
    /// `)`
    CloseParens,
    /// `+`
    Add,
    /// `-` (or the unicode minus sign)
    Sub,
    /// `*` (or a unicode multiplication sign)
    Mul,
    /// `/`, a unicode division sign, or the word `per`
    Div,
    /// the word `mod`
    Mod,
    /// `^` or `**`
    Pow,
    /// `&` or the word `and`
    BitwiseAnd,
    /// `|` or the word `or`
    BitwiseOr,
    /// the word `xor`
    BitwiseXor,
    /// the words `to`, `as` or `in`
    UnitConversion,
    /// `!`
    Factorial,
    /// `:` or `=>`
    Fn,
    /// `\` or the lambda character
    Backslash,
    /// `.`
    Dot,
    /// the word `of`
    Of,
    /// `<<`
    ShiftLeft,
    /// `>>`
    ShiftRight,
    /// `;`
    Semicolon,
    Equals,       // used for assignment
    DoubleEquals, // used for equality
    /// `!=`, `<>` or the unicode not-equal sign
    NotEquals,
    /// `nCr` or the word `choose`
    Combination,
    /// `nPr` or the word `permute`
    Permutation,
}

impl fmt::Display for Symbol {
    /// Writes the canonical textual spelling of the symbol.
    ///
    /// Formatter width/padding flags are not applied; the spelling is written
    /// verbatim.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        let s = match self {
            Self::OpenParens => "(",
            Self::CloseParens => ")",
            Self::Add => "+",
            Self::Sub => "-",
            Self::Mul => "*",
            Self::Div => "/",
            Self::Mod => "mod",
            Self::Pow => "^",
            Self::BitwiseAnd => "&",
            Self::BitwiseOr => "|",
            Self::BitwiseXor => " xor ",
            Self::UnitConversion => "to",
            Self::Factorial => "!",
            Self::Fn => ":",
            // A backslash, not a double quote: this symbol is lexed from `\`.
            Self::Backslash => "\\",
            Self::Dot => ".",
            Self::Of => "of",
            Self::ShiftLeft => "<<",
            Self::ShiftRight => ">>",
            Self::Semicolon => ";",
            Self::Equals => "=",
            Self::DoubleEquals => "==",
            Self::NotEquals => "!=",
            Self::Combination => "nCr",
            Self::Permutation => "nPr",
        };
        f.write_str(s)
    }
}
79
parse_char(input: &str) -> FResult<(char, &str)>80 fn parse_char(input: &str) -> FResult<(char, &str)> {
81 input
82 .chars()
83 .next()
84 .map_or(Err(FendError::ExpectedACharacter), |ch| {
85 let (_, b) = input.split_at(ch.len_utf8());
86 Ok((ch, b))
87 })
88 }
89
parse_ascii_digit(input: &str, base: Base) -> FResult<(u8, &str)>90 fn parse_ascii_digit(input: &str, base: Base) -> FResult<(u8, &str)> {
91 let (ch, input) = parse_char(input)?;
92 let possible_digit = ch.to_digit(base.base_as_u8().into());
93 possible_digit
94 .and_then(|d| <u32 as convert::TryInto<u8>>::try_into(d).ok())
95 .map_or(Err(FendError::ExpectedADigit(ch)), |digit| {
96 Ok((digit, input))
97 })
98 }
99
parse_fixed_char(input: &str, ch: char) -> FResult<((), &str)>100 fn parse_fixed_char(input: &str, ch: char) -> FResult<((), &str)> {
101 let (parsed_ch, input) = parse_char(input)?;
102 if parsed_ch == ch {
103 Ok(((), input))
104 } else {
105 Err(FendError::ExpectedChar(ch, parsed_ch))
106 }
107 }
108
parse_digit_separator(input: &str) -> FResult<((), &str)>109 fn parse_digit_separator(input: &str) -> FResult<((), &str)> {
110 let (parsed_ch, input) = parse_char(input)?;
111 if parsed_ch == '_' || parsed_ch == ',' {
112 Ok(((), input))
113 } else {
114 Err(FendError::ExpectedDigitSeparator(parsed_ch))
115 }
116 }
117
118 // Parses a plain integer with no whitespace and no base prefix.
119 // Leading minus sign is not allowed.
parse_integer<'a, E: From<FendError>>( input: &'a str, allow_digit_separator: bool, base: Base, process_digit: &mut impl FnMut(u8) -> Result<(), E>, ) -> Result<((), &'a str), E>120 fn parse_integer<'a, E: From<FendError>>(
121 input: &'a str,
122 allow_digit_separator: bool,
123 base: Base,
124 process_digit: &mut impl FnMut(u8) -> Result<(), E>,
125 ) -> Result<((), &'a str), E> {
126 let (digit, mut input) = parse_ascii_digit(input, base)?;
127 process_digit(digit)?;
128 let mut parsed_digit_separator;
129 loop {
130 if let Ok(((), remaining)) = parse_digit_separator(input) {
131 input = remaining;
132 parsed_digit_separator = true;
133 if !allow_digit_separator {
134 return Err(FendError::DigitSeparatorsNotAllowed.into());
135 }
136 } else {
137 parsed_digit_separator = false;
138 }
139 match parse_ascii_digit(input, base) {
140 Err(_) => {
141 if parsed_digit_separator {
142 return Err(FendError::DigitSeparatorsOnlyBetweenDigits.into());
143 }
144 break;
145 }
146 Ok((digit, next_input)) => {
147 process_digit(digit)?;
148 input = next_input;
149 }
150 }
151 }
152 Ok(((), input))
153 }
154
parse_base_prefix(input: &str) -> FResult<(Base, &str)>155 fn parse_base_prefix(input: &str) -> FResult<(Base, &str)> {
156 // 0x -> 16
157 // 0o -> 8
158 // 0b -> 2
159 // base# -> base (where 2 <= base <= 36)
160 // case-sensitive, no whitespace allowed
161 if let Ok(((), input)) = parse_fixed_char(input, '0') {
162 let (ch, input) = parse_char(input)?;
163 Ok((Base::from_zero_based_prefix_char(ch)?, input))
164 } else {
165 let mut custom_base: u8 = 0;
166 let ((), input) = parse_integer(input, false, Base::default(), &mut |digit| -> Result<
167 (),
168 FendError,
169 > {
170 let error = FendError::BaseTooLarge;
171 if custom_base > 3 {
172 return Err(error);
173 }
174 custom_base = 10 * custom_base + digit;
175 if custom_base > 36 {
176 return Err(error);
177 }
178 Ok(())
179 })?;
180 if custom_base < 2 {
181 return Err(FendError::BaseTooSmall);
182 }
183 let ((), input) = parse_fixed_char(input, '#')?;
184 Ok((Base::from_custom_base(custom_base)?, input))
185 }
186 }
187
188 // Try and parse recurring digits in parentheses.
189 // '1.0(0)' -> success
190 // '1.0(a)', '1.0( 0)' -> Ok, but not parsed
191 // '1.0(3a)' -> FendError
192
parse_recurring_digits<'a, I: Interrupt>( input: &'a str, number: &mut Number, num_nonrec_digits: usize, base: Base, int: &I, ) -> FResult<((), &'a str)>193 fn parse_recurring_digits<'a, I: Interrupt>(
194 input: &'a str,
195 number: &mut Number,
196 num_nonrec_digits: usize,
197 base: Base,
198 int: &I,
199 ) -> FResult<((), &'a str)> {
200 let original_input = input;
201 // If there's no '(': return Ok but don't parse anything
202 if parse_fixed_char(input, '(').is_err() {
203 return Ok(((), original_input));
204 }
205 let ((), input) = parse_fixed_char(input, '(')?;
206 if parse_ascii_digit(input, base).is_err() {
207 // return Ok if there were no digits
208 return Ok(((), original_input));
209 }
210 let mut recurring_number_num = Number::from(0);
211 let mut recurring_number_den = Number::from(1);
212 let base_as_u64 = u64::from(base.base_as_u8());
213 let ((), input) = parse_integer(input, true, base, &mut |digit| -> FResult<()> {
214 let digit_as_u64 = u64::from(digit);
215 recurring_number_num = recurring_number_num
216 .clone()
217 .mul(base_as_u64.into(), int)?
218 .add(digit_as_u64.into(), int)?;
219 recurring_number_den = recurring_number_den.clone().mul(base_as_u64.into(), int)?;
220 Ok(())
221 })?;
222 recurring_number_den = recurring_number_den.clone().sub(1.into(), int)?;
223 for _ in 0..num_nonrec_digits {
224 recurring_number_den = recurring_number_den.clone().mul(base_as_u64.into(), int)?;
225 }
226 *number = number
227 .clone()
228 .add(recurring_number_num.div(recurring_number_den, int)?, int)?;
229 // return an error if there are any other characters before the closing parentheses
230 let ((), input) = parse_fixed_char(input, ')')?;
231 Ok(((), input))
232 }
233
234 #[allow(clippy::too_many_lines)]
parse_basic_number<'a, I: Interrupt>( mut input: &'a str, base: Base, int: &I, ) -> FResult<(Number, &'a str)>235 fn parse_basic_number<'a, I: Interrupt>(
236 mut input: &'a str,
237 base: Base,
238 int: &I,
239 ) -> FResult<(Number, &'a str)> {
240 let mut is_dice_with_no_count = false;
241 if input.starts_with('d') && base.base_as_u8() <= 10 {
242 let mut chars = input.chars();
243 chars.next();
244 let following = chars.next();
245 if following.is_some() && following.unwrap().is_ascii_digit() {
246 is_dice_with_no_count = true;
247 }
248 }
249
250 // parse integer component
251 let mut res = Number::zero_with_base(base);
252 let base_as_u64 = u64::from(base.base_as_u8());
253 let mut is_integer = true;
254
255 if parse_fixed_char(input, '.').is_err() && !is_dice_with_no_count {
256 let ((), remaining) = parse_integer(input, true, base, &mut |digit| -> FResult<()> {
257 res = res
258 .clone()
259 .mul(base_as_u64.into(), int)?
260 .add(u64::from(digit).into(), int)?;
261 Ok(())
262 })?;
263 input = remaining;
264 }
265
266 // parse decimal point and at least one digit
267 if let Ok(((), remaining)) = parse_fixed_char(input, '.') {
268 is_integer = false;
269 let mut num_nonrec_digits = 0;
270 let mut numerator = Number::zero_with_base(base);
271 let mut denominator = Number::zero_with_base(base).add(1.into(), int)?;
272 if parse_fixed_char(remaining, '(').is_err() {
273 let ((), remaining) = parse_integer(remaining, true, base, &mut |digit| -> Result<
274 (),
275 FendError,
276 > {
277 numerator = numerator
278 .clone()
279 .mul(base_as_u64.into(), int)?
280 .add(u64::from(digit).into(), int)?;
281 denominator = denominator.clone().mul(base_as_u64.into(), int)?;
282 num_nonrec_digits += 1;
283 Ok(())
284 })?;
285 input = remaining;
286 } else {
287 input = remaining;
288 }
289 res = res.add(numerator.div(denominator, int)?, int)?;
290
291 // try parsing recurring decimals
292 let ((), remaining) =
293 parse_recurring_digits(input, &mut res, num_nonrec_digits, base, int)?;
294 input = remaining;
295 }
296
297 // parse dice syntax
298 if is_integer && base.base_as_u8() <= 10 {
299 if let Ok(((), remaining)) = parse_fixed_char(input, 'd') {
300 // peek to see if there's a digit immediately after the `d`:
301 if parse_ascii_digit(remaining, base).is_ok() {
302 let dice_count: u32 = if is_dice_with_no_count {
303 1
304 } else {
305 convert::TryFrom::try_from(res.try_as_usize(int)?)
306 .map_err(|_| FendError::InvalidDiceSyntax)?
307 };
308 let mut face_count = 0_u32;
309 let ((), remaining2) =
310 parse_integer(remaining, false, base, &mut |digit| -> FResult<()> {
311 face_count = face_count
312 .checked_mul(base.base_as_u8().into())
313 .ok_or(FendError::InvalidDiceSyntax)?
314 .checked_add(digit.into())
315 .ok_or(FendError::InvalidDiceSyntax)?;
316 Ok(())
317 })?;
318 if dice_count == 0 || face_count == 0 {
319 return Err(FendError::InvalidDiceSyntax);
320 }
321 res = Number::new_die(dice_count, face_count, int)?;
322 res = res.with_base(base);
323 return Ok((res, remaining2));
324 }
325 }
326 }
327
328 // parse optional exponent, but only for base 10 and below
329 if base.base_as_u8() <= 10 {
330 let (parsed_exponent, remaining) = if let Ok(((), remaining)) = parse_fixed_char(input, 'e')
331 {
332 (true, remaining)
333 } else if let Ok(((), remaining)) = parse_fixed_char(input, 'E') {
334 (true, remaining)
335 } else {
336 (false, "")
337 };
338
339 if parsed_exponent {
340 // peek ahead to the next char to determine if we should continue parsing an exponent
341 let abort = if let Ok((ch, _)) = parse_char(remaining) {
342 // abort if there is a non-digit non-plus or minus char after 'e',
343 // such as '(', '/' or 'a'. Note that this is only parsed in base <= 10,
344 // so letters can never be digits. We do want to include all digits even for
345 // base < 10 though to avoid 6#3e9 from being valid.
346 !(ch.is_ascii_digit() || ch == '+' || ch == '-')
347 } else {
348 // if there is no more input after the 'e', abort
349 true
350 };
351 if !abort {
352 input = remaining;
353 let mut negative_exponent = false;
354 if let Ok(((), remaining)) = parse_fixed_char(input, '-') {
355 negative_exponent = true;
356 input = remaining;
357 } else if let Ok(((), remaining)) = parse_fixed_char(input, '+') {
358 input = remaining;
359 }
360 let mut exp = Number::zero_with_base(base);
361 let base_num = Number::from(u64::from(base.base_as_u8()));
362 let ((), remaining2) =
363 parse_integer(input, true, base, &mut |digit| -> FResult<()> {
364 exp = (exp.clone().mul(base_num.clone(), int)?)
365 .add(u64::from(digit).into(), int)?;
366 Ok(())
367 })?;
368 if negative_exponent {
369 exp = -exp;
370 }
371 let base_as_number: Number = base_as_u64.into();
372 res = res.mul(base_as_number.pow(exp, int)?, int)?;
373 input = remaining2;
374 }
375 }
376 }
377
378 // parse exponentiation via unicode superscript digits
379 if base.base_as_u8() <= 10
380 && input
381 .chars()
382 .next()
383 .is_some_and(|c| SUPERSCRIPT_DIGITS.contains(&c))
384 {
385 if let Ok((mut power_digits, remaining)) = parse_power_number(input) {
386 let mut exponent = Number::zero_with_base(base);
387
388 power_digits.reverse();
389
390 for (i, digit) in power_digits.into_iter().enumerate() {
391 let num = digit * 10u64.pow(u32::try_from(i).unwrap());
392 exponent = exponent.add(num.into(), int)?;
393 }
394
395 res = res.pow(exponent, int)?;
396 input = remaining;
397 }
398 }
399
400 Ok((res, input))
401 }
402
403 const SUPERSCRIPT_DIGITS: [char; 10] = ['⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹'];
404
parse_power_number(input: &str) -> FResult<(Vec<u64>, &str)>405 fn parse_power_number(input: &str) -> FResult<(Vec<u64>, &str)> {
406 let mut digits: Vec<u64> = Vec::new();
407
408 let (mut ch, mut input) = parse_char(input)?;
409 while let Some((idx, _)) = SUPERSCRIPT_DIGITS
410 .iter()
411 .enumerate()
412 .find(|(_, x)| **x == ch)
413 {
414 digits.push(idx as u64);
415 if input.is_empty() {
416 break;
417 }
418 (ch, input) = parse_char(input)?;
419 }
420
421 Ok((digits, input))
422 }
423
parse_number<'a, I: Interrupt>(input: &'a str, int: &I) -> FResult<(Number, &'a str)>424 fn parse_number<'a, I: Interrupt>(input: &'a str, int: &I) -> FResult<(Number, &'a str)> {
425 let (base, input) = parse_base_prefix(input).unwrap_or((Base::default(), input));
426 let (res, input) = parse_basic_number(input, base, int)?;
427 Ok((res, input))
428 }
429
/// Decides whether `ch` may appear in an identifier, given the character
/// before it in the same identifier (`None` when `ch` would be the first).
fn is_valid_in_ident(ch: char, prev: Option<char>) -> bool {
    // non-alphabetic characters that are accepted anywhere in an identifier
    const ALLOWED_CHARS: &[char] = &[
        ',', '_', '⅛', '¼', '⅜', '½', '⅝', '¾', '⅞', '⅙', '⅓', '⅔', '⅚', '⅕', '⅖', '⅗', '⅘', '°',
        '$', '℃', '℉', '℧', '℈', '℥', '℔', '¢', '£', '¥', '€', '₩', '₪', '₤', '₨', '฿', '₡', '₣',
        '₦', '₧', '₫', '₭', '₮', '₯', '₱', '﷼', '﹩', '¢', '£', '¥', '₩', '㍱', '㍲', '㍳',
        '㍴', '㍶', '㎀', '㎁', '㎂', '㎃', '㎄', '㎅', '㎆', '㎇', '㎈', '㎉', '㎊', '㎋', '㎌',
        '㎍', '㎎', '㎏', '㎐', '㎑', '㎒', '㎓', '㎔', '㎕', '㎖', '㎗', '㎘', '㎙', '㎚', '㎛',
        '㎜', '㎝', '㎞', '㎟', '㎠', '㎡', '㎢', '㎣', '㎤', '㎥', '㎦', '㎧', '㎨', '㎩', '㎪',
        '㎫', '㎬', '㎭', '㎮', '㎯', '㎰', '㎱', '㎲', '㎳', '㎴', '㎵', '㎶', '㎷', '㎸', '㎹',
        '㎺', '㎻', '㎼', '㎽', '㎾', '㎿', '㏀', '㏁', '㏃', '㏄', '㏅', '㏆', '㏈', '㏉', '㏊',
        '㏌', '㏏', '㏐', '㏓', '㏔', '㏕', '㏖', '㏗', '㏙', '㏛', '㏜', '㏝',
    ];
    // characters that form a complete identifier on their own
    const STANDALONE: &[char] = &['%', '‰', '‱', '′', '″', '’', '”', 'π'];
    // currency signs after which a digit must start a new token
    const SPLIT_BEFORE_DIGIT: &[char] = &['$', '£', '¥'];

    // lambda is never part of an identifier
    if ch == 'λ' {
        return false;
    }
    // standalone characters are only valid as the very first character
    if STANDALONE.contains(&ch) {
        return prev.is_none();
    }
    match prev {
        // nothing may follow a standalone character
        Some(before) if STANDALONE.contains(&before) => false,
        _ if ch.is_alphabetic() || ALLOWED_CHARS.contains(&ch) => true,
        // dots, digits and quotes need a preceding character that is not one
        // of the split-before-digit currency signs
        Some(before) => {
            !SPLIT_BEFORE_DIGIT.contains(&before) && matches!(ch, '.' | '0'..='9' | '\'' | '"')
        }
        None => false,
    }
}
463
parse_ident(input: &str, allow_dots: bool) -> FResult<(Token, &str)>464 fn parse_ident(input: &str, allow_dots: bool) -> FResult<(Token, &str)> {
465 let (first_char, _) = parse_char(input)?;
466 if !is_valid_in_ident(first_char, None) || first_char == '.' && !allow_dots {
467 return Err(FendError::InvalidCharAtBeginningOfIdent(first_char));
468 }
469 let mut byte_idx = first_char.len_utf8();
470 let (_, mut remaining) = input.split_at(byte_idx);
471 let mut prev_char = first_char;
472 while let Ok((next_char, remaining_input)) = parse_char(remaining) {
473 if !is_valid_in_ident(next_char, Some(prev_char)) || next_char == '.' && !allow_dots {
474 break;
475 }
476 remaining = remaining_input;
477 byte_idx += next_char.len_utf8();
478 prev_char = next_char;
479 }
480 let (ident, input) = input.split_at(byte_idx);
481 Ok((
482 match ident {
483 "to" | "as" | "in" => Token::Symbol(Symbol::UnitConversion),
484 "per" => Token::Symbol(Symbol::Div),
485 "of" => Token::Symbol(Symbol::Of),
486 "mod" => Token::Symbol(Symbol::Mod),
487 "xor" | "XOR" => Token::Symbol(Symbol::BitwiseXor),
488 "and" | "AND" => Token::Symbol(Symbol::BitwiseAnd),
489 "or" | "OR" => Token::Symbol(Symbol::BitwiseOr),
490 "nCr" | "choose" => Token::Symbol(Symbol::Combination),
491 "nPr" | "permute" => Token::Symbol(Symbol::Permutation),
492 _ => Token::Ident(Ident::new_string(ident.to_string())),
493 },
494 input,
495 ))
496 }
497
parse_symbol(ch: char, input: &mut &str) -> FResult<Token>498 fn parse_symbol(ch: char, input: &mut &str) -> FResult<Token> {
499 let mut test_next = |next: char| {
500 if input.starts_with(next) {
501 let (_, remaining) = input.split_at(next.len_utf8());
502 *input = remaining;
503 true
504 } else {
505 false
506 }
507 };
508 Ok(Token::Symbol(match ch {
509 '(' => Symbol::OpenParens,
510 ')' => Symbol::CloseParens,
511 '+' => Symbol::Add,
512 '!' => {
513 if test_next('=') {
514 Symbol::NotEquals
515 } else {
516 Symbol::Factorial
517 }
518 }
519 // unicode minus sign
520 '-' | '\u{2212}' => Symbol::Sub,
521 '*' | '\u{d7}' | '\u{2715}' => {
522 if test_next('*') {
523 Symbol::Pow
524 } else {
525 Symbol::Mul
526 }
527 }
528 '/' | '\u{f7}' | '\u{2215}' => Symbol::Div, // unicode division symbol and slash
529 '^' => Symbol::Pow,
530 '&' => Symbol::BitwiseAnd,
531 '|' => Symbol::BitwiseOr,
532 ':' => Symbol::Fn,
533 '=' => {
534 if test_next('>') {
535 Symbol::Fn
536 } else if test_next('=') {
537 Symbol::DoubleEquals
538 } else {
539 Symbol::Equals
540 }
541 }
542 '\u{2260}' => Symbol::NotEquals, // unicode not equal to symbol
543 '\\' | '\u{3bb}' => Symbol::Backslash, // lambda symbol
544 '.' => Symbol::Dot,
545 '<' => {
546 if test_next('<') {
547 Symbol::ShiftLeft
548 } else if test_next('>') {
549 Symbol::NotEquals
550 } else {
551 return Err(FendError::UnexpectedChar(ch));
552 }
553 }
554 '>' => {
555 if test_next('>') {
556 Symbol::ShiftRight
557 } else {
558 return Err(FendError::UnexpectedChar(ch));
559 }
560 }
561 ';' => Symbol::Semicolon,
562 _ => return Err(FendError::UnexpectedChar(ch)),
563 }))
564 }
565
parse_unicode_escape(chars_iter: &mut std::str::CharIndices<'_>) -> FResult<char>566 fn parse_unicode_escape(chars_iter: &mut std::str::CharIndices<'_>) -> FResult<char> {
567 if chars_iter
568 .next()
569 .ok_or(FendError::UnterminatedStringLiteral)?
570 .1 != '{'
571 {
572 return Err(FendError::InvalidUnicodeEscapeSequence);
573 }
574 let mut result_value = 0;
575 let mut zero_length = true;
576 loop {
577 let (_, ch) = chars_iter
578 .next()
579 .ok_or(FendError::UnterminatedStringLiteral)?;
580 if ch.is_ascii_hexdigit() {
581 zero_length = false;
582 result_value *= 16;
583 result_value += ch
584 .to_digit(16)
585 .ok_or(FendError::InvalidUnicodeEscapeSequence)?;
586 if result_value > 0x10_ffff {
587 return Err(FendError::InvalidUnicodeEscapeSequence);
588 }
589 } else if ch == '}' {
590 break;
591 } else {
592 return Err(FendError::InvalidUnicodeEscapeSequence);
593 }
594 }
595 if zero_length {
596 return Err(FendError::InvalidUnicodeEscapeSequence);
597 }
598 if let Ok(ch) = <char as convert::TryFrom<u32>>::try_from(result_value) {
599 Ok(ch)
600 } else {
601 Err(FendError::InvalidUnicodeEscapeSequence)
602 }
603 }
604
parse_string_literal(input: &str, terminator: char) -> FResult<(Token, &str)>605 fn parse_string_literal(input: &str, terminator: char) -> FResult<(Token, &str)> {
606 let (_, input) = input.split_at(1);
607 let mut chars_iter = input.char_indices();
608 let mut literal_length = None;
609 let mut literal_string = String::new();
610 let mut skip_whitespace = false;
611 while let Some((idx, ch)) = chars_iter.next() {
612 if skip_whitespace {
613 if ch.is_ascii_whitespace() {
614 continue;
615 }
616 skip_whitespace = false;
617 }
618 if ch == terminator {
619 literal_length = Some(idx);
620 break;
621 }
622 if ch == '\\' {
623 let (_, next) = chars_iter
624 .next()
625 .ok_or(FendError::UnterminatedStringLiteral)?;
626 let escaped_char = match next {
627 '\\' => Some('\\'),
628 '"' => Some('"'),
629 '\'' => Some('\''),
630 'a' => Some('\u{7}'), // bell
631 'b' => Some('\u{8}'), // backspace
632 'e' => Some('\u{1b}'), // escape
633 'f' => Some('\u{c}'), // form feed
634 'n' => Some('\n'), // line feed
635 'r' => Some('\r'), // carriage return
636 't' => Some('\t'), // tab
637 'v' => Some('\u{0b}'), // vertical tab
638 'x' => {
639 // two-character hex code
640 let (_, hex1) = chars_iter
641 .next()
642 .ok_or(FendError::UnterminatedStringLiteral)?;
643 let (_, hex2) = chars_iter
644 .next()
645 .ok_or(FendError::UnterminatedStringLiteral)?;
646 let hex1: u8 = convert::TryInto::try_into(
647 hex1.to_digit(8).ok_or(FendError::BackslashXOutOfRange)?,
648 )
649 .unwrap();
650 let hex2: u8 = convert::TryInto::try_into(
651 hex2.to_digit(16).ok_or(FendError::BackslashXOutOfRange)?,
652 )
653 .unwrap();
654 Some((hex1 * 16 + hex2) as char)
655 }
656 'u' => Some(parse_unicode_escape(&mut chars_iter)?),
657 'z' => {
658 skip_whitespace = true;
659 None
660 }
661 '^' => {
662 // control character escapes
663 let (_, letter) = chars_iter
664 .next()
665 .ok_or(FendError::UnterminatedStringLiteral)?;
666 let code = letter as u8;
667 if !(63..=95).contains(&code) {
668 return Err(FendError::ExpectedALetterOrCode);
669 }
670 Some(if code == b'?' {
671 '\x7f'
672 } else {
673 (code - 64) as char
674 })
675 }
676 _ => return Err(FendError::UnknownBackslashEscapeSequence(next)),
677 };
678 if let Some(escaped_char) = escaped_char {
679 literal_string.push(escaped_char);
680 }
681 } else {
682 literal_string.push(ch);
683 }
684 }
685 let literal_length = literal_length.ok_or(FendError::UnterminatedStringLiteral)?;
686 let (_, remaining) = input.split_at(literal_length + 1);
687 Ok((Token::StringLiteral(literal_string.into()), remaining))
688 }
689
690 // parses a unit beginning with ' or "
parse_quote_unit(input: &str) -> (Token, &str)691 fn parse_quote_unit(input: &str) -> (Token, &str) {
692 let mut split_idx = 1;
693 if let Some(ch) = input.split_at(1).1.chars().next() {
694 if ch.is_alphabetic() {
695 split_idx += ch.len_utf8();
696 let mut prev = ch;
697 let (_, mut remaining) = input.split_at(split_idx);
698 while let Some(next) = remaining.chars().next() {
699 if !is_valid_in_ident(next, Some(prev)) {
700 break;
701 }
702 split_idx += next.len_utf8();
703 prev = next;
704 let (_, remaining2) = input.split_at(split_idx);
705 remaining = remaining2;
706 }
707 }
708 }
709 let (a, b) = input.split_at(split_idx);
710 (Token::Ident(Ident::new_string(a.to_string())), b)
711 }
712
/// Tokenizer over an input string; tokens are produced through its
/// `Iterator` implementation.
pub(crate) struct Lexer<'a, 'b, I: Interrupt> {
    /// The part of the input that has not been consumed yet.
    input: &'a str,
    // normally 0; 1 after backslash; 2 after ident after backslash
    after_backslash_state: u8,
    /// Whether the previously yielded token was a number or a unit-conversion
    /// symbol; in that position `'` and `"` are lexed as units rather than
    /// as string delimiters.
    after_number_or_to: bool,
    /// Interrupt handle passed on to number parsing.
    int: &'b I,
}
720
/// Advances `input` past any leading whitespace and comments.
///
/// A comment starts with `# ` or `#!` and runs to the end of the line (or to
/// the end of the input if there is no newline).
fn skip_whitespace_and_comments(input: &mut &str) {
    loop {
        let current: &str = *input;
        if current.starts_with("# ") || current.starts_with("#!") {
            let Some(newline_idx) = current.find('\n') else {
                // the comment runs to the end of the input
                *input = "";
                return;
            };
            // keep the newline; the next iteration skips it as whitespace
            *input = &current[newline_idx..];
        } else {
            match current.chars().next() {
                Some(ch) if ch.is_whitespace() => *input = &current[ch.len_utf8()..],
                _ => break,
            }
        }
    }
}
741
parse_date(input: &str) -> FResult<(Date, &str)>742 fn parse_date(input: &str) -> FResult<(Date, &str)> {
743 let (_, input) = input.split_at(1); // skip '@' symbol
744 let mut input2 = input;
745 let mut split_idx = 0;
746 for i in 0..3 {
747 let mut n = 0;
748 while matches!(input2.chars().next(), Some('0'..='9')) {
749 let (_, remaining) = input2.split_at(1);
750 input2 = remaining;
751 n += 1;
752 split_idx += 1;
753 }
754 if n == 0 {
755 return Err(FendError::ExpectedADateLiteral);
756 }
757 if i == 2 {
758 break;
759 }
760 if !input2.starts_with('-') {
761 return Err(FendError::ExpectedADateLiteral);
762 }
763 let (_, remaining) = input2.split_at(1);
764 input2 = remaining;
765 split_idx += 1;
766 }
767 let (date_str, result_remaining) = input.split_at(split_idx);
768 let res = Date::parse(date_str)?;
769 Ok((res, result_remaining))
770 }
771
772 impl<'a, 'b, I: Interrupt> Lexer<'a, 'b, I> {
next_token(&mut self) -> FResult<Option<Token>>773 fn next_token(&mut self) -> FResult<Option<Token>> {
774 skip_whitespace_and_comments(&mut self.input);
775 let (ch, following) = {
776 let mut chars = self.input.chars();
777 let ch = chars.next();
778 let following = chars.next();
779 (ch, following)
780 };
781 Ok(Some(match ch {
782 Some(ch) => {
783 if ch.is_ascii_digit()
784 || (ch == '.' && self.after_backslash_state == 0)
785 || (ch == 'd' && following.is_some() && following.unwrap().is_ascii_digit())
786 {
787 let (num, remaining) = parse_number(self.input, self.int)?;
788 self.input = remaining;
789 Token::Num(num)
790 } else if ch == '\'' || ch == '"' {
791 if self.after_number_or_to {
792 let (token, remaining) = parse_quote_unit(self.input);
793 self.input = remaining;
794 token
795 } else {
796 // normal string literal, with possible escape sequences
797 let (token, remaining) = parse_string_literal(self.input, ch)?;
798 self.input = remaining;
799 token
800 }
801 } else if ch == '@' {
802 // date literal, e.g. @1970-01-01
803 let (date, remaining) = parse_date(self.input)?;
804 self.input = remaining;
805 Token::Date(date)
806 } else if self.input.starts_with("#\"") {
807 // raw string literal
808 let (_, remaining) = self.input.split_at(2);
809 let literal_length = remaining
810 .match_indices("\"#")
811 .next()
812 .ok_or(FendError::UnterminatedStringLiteral)?
813 .0;
814 let (literal, remaining) = remaining.split_at(literal_length);
815 let (_terminator, remaining) = remaining.split_at(2);
816 self.input = remaining;
817 Token::StringLiteral(literal.to_string().into())
818 } else if is_valid_in_ident(ch, None) {
819 // dots aren't allowed in idents after a backslash
820 let (ident, remaining) =
821 parse_ident(self.input, self.after_backslash_state != 1)?;
822 self.input = remaining;
823 ident
824 } else {
825 let (_, remaining) = self.input.split_at(ch.len_utf8());
826 self.input = remaining;
827 parse_symbol(ch, &mut self.input)?
828 }
829 }
830 None => return Ok(None),
831 }))
832 }
833 }
834
835 impl<'a, I: Interrupt> Iterator for Lexer<'a, '_, I> {
836 type Item = FResult<Token>;
837
next(&mut self) -> Option<Self::Item>838 fn next(&mut self) -> Option<Self::Item> {
839 let res = match self.next_token() {
840 Err(e) => Some(Err(e)),
841 Ok(None) => None,
842 Ok(Some(t)) => Some(Ok(t)),
843 };
844 self.after_number_or_to = matches!(
845 res,
846 Some(Ok(Token::Num(_) | Token::Symbol(Symbol::UnitConversion)))
847 );
848 if matches!(res, Some(Ok(Token::Symbol(Symbol::Backslash)))) {
849 self.after_backslash_state = 1;
850 } else if self.after_backslash_state == 1 {
851 if let Some(Ok(Token::Ident(_))) = res {
852 self.after_backslash_state = 2;
853 } else {
854 self.after_backslash_state = 0;
855 }
856 } else {
857 self.after_backslash_state = 0;
858 }
859 res
860 }
861 }
862
lex<'a, 'b, I: Interrupt>(input: &'a str, int: &'b I) -> Lexer<'a, 'b, I>863 pub(crate) fn lex<'a, 'b, I: Interrupt>(input: &'a str, int: &'b I) -> Lexer<'a, 'b, I> {
864 Lexer {
865 input,
866 after_backslash_state: 0,
867 after_number_or_to: false,
868 int,
869 }
870 }
871