Skip to content

Commit

Permalink
✨ fix the macro var expr lexing
Browse files Browse the repository at this point in the history
📝 update unreleased changelog
✅ split python samples & snaps, to decouple tests across versions
  • Loading branch information
mishamsk committed Dec 8, 2024
1 parent 1c260b0 commit ac3e553
Show file tree
Hide file tree
Showing 22 changed files with 1,962 additions and 774 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
# Explicitly declare test assets used in snapshot tests to use LF line endings
# to avoid issues with snapshot tests on Windows.
crates/sas-lexer/src/lexer/tests/samples/*.sas text eol=lf
tests/samples/*.sas text eol=lf
12 changes: 8 additions & 4 deletions CHANGELOG-RUST.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,23 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
## [Unreleased] - ReleaseDate

### Added
-
- ✨ Added two public iterator APIs on `TokenizedBuffer`:
- `iter_tokens` - returns an iterator over token indexes
- `iter_tokens_infos` - returns an iterator over token infos
- It uses the new public type: `TokenInfoIter`
- ✨ New buffer API to get fully resolved token text: `get_token_resolved_text` instead of manually checking for string payload

### Changed
-
- ✨ Made `TokenInfo` public with read-only getters

### Deprecated
-

### Removed
-
- 🔥 remove IntoIterator impl for TokenizedBuffer

### Fixed
-
- 💥 Reworked macro var expression lexing to properly account for trailing terminating dots. This is a breaking change, as the token types emitted by the lexer have changed. Now, instead of one `MacroVarExpr` token for the entire expression, a sequence of `MacroVarResolve`, `MacroString` and an optional `MacroVarTerm` is emitted.

### Security
-
Expand Down
4 changes: 3 additions & 1 deletion crates/sas-lexer/src/lexer/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ pub enum Payload {
None,
/// Stores parsed integer value. We do not parse -N as a single token
/// so it is unsigned.
/// Also stores the inverted precedence for the macro var resolve operator (&),
/// where the number is the log2 of the amp count.
Integer(u64),
/// Stores parsed float value
Float(f64),
Expand Down Expand Up @@ -1174,7 +1176,7 @@ mod tests {
"Hello, world!\n"
);

// This one should be the string literal, exlcude the quotes
// This one should be the string literal, exclude the quotes
assert_eq!(
detached
.get_token_resolved_text(token2, &source)
Expand Down
34 changes: 27 additions & 7 deletions crates/sas-lexer/src/lexer/macro.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use unicode_ident::is_xid_continue;
use super::{
cursor::Cursor,
error::ErrorKind,
sas_lang::is_valid_sas_name_start,
sas_lang::{is_valid_sas_name_continue, is_valid_unicode_sas_name_start},
token_type::{
parse_macro_keyword, TokenType, TokenTypeMacroCallOrStat,
MACRO_QUOTE_CALL_TOKEN_TYPE_RANGE, MACRO_STAT_TOKEN_TYPE_RANGE, MAX_MKEYWORDS_LEN,
Expand All @@ -31,12 +31,28 @@ pub(super) fn is_macro_amp<I: Iterator<Item = char>>(mut chars: I) -> (bool, u32
amp_count += 1;
continue;
}
Some(c) if is_valid_sas_name_start(c) => return (true, amp_count),
Some(c) if is_valid_unicode_sas_name_start(c) => return (true, amp_count),
_ => return (false, amp_count),
}
}
}

/// Splits a run of `amp_count` consecutive ampersands into the list of
/// macro variable resolve operations it stands for.
///
/// Effectively (this models the observable result, presumably not the way
/// the SAS engine processes it internally), N consecutive amps become K
/// resolve operations, one per set bit of N. Reading right to left, each
/// operation corresponds to a power of 2, starting from 0 (a single amp).
/// E.g. `&&&var._` is `&&((&var.)_)` and not `&(&(&var.))_`.
///
/// The returned vector therefore holds the positions of the set bits of
/// `amp_count`, ordered from the highest bit down to the lowest. Each
/// position is the power of 2 of one resolve operation, which corresponds
/// to the reverse precedence of that operation. An `amp_count` of zero
/// yields an empty vector.
pub(super) fn get_macro_resolve_ops_from_amps(amp_count: u32) -> Vec<u8> {
    let mut resolve_ops = Vec::new();

    // Walk the 32 bit positions from the most significant one down, so the
    // widest-reaching resolve operation comes first in the result.
    for power in (0u8..32).rev() {
        let bit_is_set = (amp_count >> power) & 1u32 == 1;

        if bit_is_set {
            resolve_ops.push(power);
        }
    }

    resolve_ops
}

#[inline]
pub(super) fn is_macro_eval_quotable_op(c: char) -> bool {
// Experimentally shown to work! (ignores the %)
Expand All @@ -52,7 +68,9 @@ pub(super) fn is_macro_percent(follow_char: char, in_eval_context: bool) -> bool
match follow_char {
// Macro comment
'*' => true,
c if is_valid_sas_name_start(c) || (in_eval_context && is_macro_eval_quotable_op(c)) => {
c if is_valid_unicode_sas_name_start(c)
|| (in_eval_context && is_macro_eval_quotable_op(c)) =>
{
true
}
_ => false,
Expand Down Expand Up @@ -158,7 +176,7 @@ pub(super) fn lex_macro_call_stat_or_label(
cursor: &mut Cursor,
) -> Result<(TokenTypeMacroCallOrStat, u32), ErrorKind> {
debug_assert!(
cursor.peek().is_some_and(is_valid_sas_name_start),
cursor.peek().is_some_and(is_valid_unicode_sas_name_start),
"Unexpected first character in the cursor: {:?}",
cursor.peek()
);
Expand All @@ -170,14 +188,16 @@ pub(super) fn lex_macro_call_stat_or_label(
// Start tracking whether the identifier is ASCII
// It is necessary, as we need to upper case the identifier if it is ASCII
// for checking against statement names, and if it is not ASCII,
// we know it is not a keyword and can skip the test right away
// we know it is not a keyword and can skip the test right away.
// And the only reason we even bother with unicode is because
// apparently unicode macro labels actually work in SAS despite the docs...
let mut is_ascii = true;

// Eat the identifier. We can safely use `is_xid_continue` because the caller
// already checked that the first character is a valid start of an identifier
cursor.eat_while(|c| {
if c.is_ascii() {
matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
is_valid_sas_name_continue(c)
} else if is_xid_continue(c) {
is_ascii = false;
true
Expand Down Expand Up @@ -298,7 +318,7 @@ pub(super) fn is_macro_stat(input: &str) -> bool {
let pending_ident_len = input[1..]
.find(|c: char| {
if c.is_ascii() {
!matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
!is_valid_sas_name_continue(c)
} else if is_xid_continue(c) {
is_ascii = false;
false
Expand Down
Loading

0 comments on commit ac3e553

Please sign in to comment.