From f2ab30fdfef91cb2d44d3dd2b7c76d417fa8665d Mon Sep 17 00:00:00 2001
From: Gerald Bauer <gerald.bauer@gmail.com>
Date: Tue, 7 Feb 2023 17:24:43 +0100
Subject: [PATCH] up lexer

---
 solidity/NOTES.md                   | 55 ------------------
 solidity/lib/solidity/lexer.rb      | 45 ++++++---------
 solidity/lib/solidity/parser.rb     |  6 +-
 solidity/lib/solidity/version.rb    |  4 +-
 solidity/sandbox/test_lexer_ruby.rb | 90 +++++++++++++++++++++++++++++
 solidity/test/test_lexer.rb         | 37 ++++++++++++
 6 files changed, 150 insertions(+), 87 deletions(-)
 create mode 100644 solidity/sandbox/test_lexer_ruby.rb
 create mode 100644 solidity/test/test_lexer.rb

diff --git a/solidity/NOTES.md b/solidity/NOTES.md
index 7d1900f..9f4e7d4 100644
--- a/solidity/NOTES.md
+++ b/solidity/NOTES.md
@@ -55,58 +55,3 @@ https://pygments.org/docs/lexers/
 
 
 
-RubyVM::AbstractSyntaxTree.parse("puts('test', )", keep_tokens: true).tokens
-# =>
-# [[0, :tIDENTIFIER, "puts", [1, 0, 1, 4]],
-#  [1, :"(", "(", [1, 4, 1, 5]],
-#  [2, :tSTRING_BEG, "'", [1, 5, 1, 6]],
-#  [3, :tSTRING_CONTENT, "test", [1, 6, 1, 10]],
-#  [4, :tSTRING_END, "'", [1, 10, 1, 11]],
-#  [5, :",", ",", [1, 11, 1, 12]],
-#  [6, :tSP, " ", [1, 12, 1, 13]],
-#  [7, :")", ")", [1, 13, 1, 14]]]
-
-
-require 'ripper'
-require 'pp'
-
-code = <<STR
-
-
-5.times    do    |    x    |
-	puts x
-  puts "hello"
-  puts 'hello'       ## a comment here
-end
-
-
-STR
-
-puts code
-pp Ripper.lex(code)
-
-
-
-[[[1, 0], :on_ignored_nl, "\n", BEG],
- [[2, 0], :on_ignored_nl, "\n", BEG],
- [[3, 0], :on_int, "5", END],
- [[3, 1], :on_period, ".", DOT],
- [[3, 2], :on_ident, "times", ARG],
- [[3, 7], :on_sp, "    ", ARG],
- [[3, 11], :on_kw, "do", BEG],
- [[3, 13], :on_sp, "    ", BEG],
- [[3, 17], :on_op, "|", BEG|LABEL],
- [[3, 18], :on_sp, "    ", BEG|LABEL],
- [[3, 22], :on_ident, "x", ARG],
- [[3, 23], :on_sp, "    ", ARG],
- [[3, 27], :on_op, "|", BEG|LABEL],
- [[3, 28], :on_ignored_nl, "\n", BEG|LABEL],
- [[4, 0], :on_sp, "\t", BEG|LABEL],
- [[4, 1], :on_ident, "puts", CMDARG],
- [[4, 5], :on_sp, " ", CMDARG],
- [[4, 6], :on_ident, "x", END|LABEL],
- [[4, 7], :on_nl, "\n", BEG],
- [[5, 0], :on_kw, "end", END],
- [[5, 3], :on_nl, "\n", BEG],
- [[6, 0], :on_ignored_nl, "\n", BEG],
- [[7, 0], :on_ignored_nl, "\n", BEG]]
\ No newline at end of file
diff --git a/solidity/lib/solidity/lexer.rb b/solidity/lib/solidity/lexer.rb
index 2d390d9..6d6b18f 100644
--- a/solidity/lib/solidity/lexer.rb
+++ b/solidity/lib/solidity/lexer.rb
@@ -43,15 +43,14 @@ def initialize( txt )
   ## SingleQuotedStringCharacter
   ##   : ~['\r\n\\] | ('\\' .) ;
 
+  DOUBLE_QUOTE       = %r{"
+                           ( \\. | [^"\r\n\\] )*   # escape pair (backslash + any char) or plain char
+                          "}x
 
   SINGLE_QUOTE       = %r{'
-                           ( \\\\. | [^'] )*
+                           ( \\. | [^'\r\n\\] )*   # escape pair (backslash + any char) or plain char
                          '}x
 
-  DOUBLE_QUOTE       = %r{"
-                           ( \\\\. | [^"] )*
-                          "}x
-
 
   ## from the solidity grammar
   ##  > An identifier in solidity has to start with a letter,
@@ -76,40 +75,34 @@ def initialize( txt )
   ##
   ## COMMENT
   ##   : '/*' .*? '*/'  ;
-  ##
   ## LINE_COMMENT
   ##   : '//' ~[\r\n]* ;
 
+  COMMENT = %r{/\*
+                .*?      # m flag lets the dot cross newlines for multi-line comments
+                \*/}mx
+
+  LINE_COMMENT = %r{//
+                     [^\r\n]*}x
 
    def tokenize
      t = []
      s = StringScanner.new( @txt )
 
      until s.eos?   ## loop until hitting end-of-string (file)
-       if s.check( /[ \t]*\/\*/ )
-          ## note: auto-slurp leading (optinal) spaces!!!! - why? why not?
-          comment = s.scan_until( /\*\// )
-          ## print "multi-line comment:"
-          ## pp comment
-          t << [:comment, comment.lstrip]
-       elsif s.check( /[ \t]*\/\// )
-          ## note: auto-slurp leading (optinal) spaces!!!!  - why? why not?
-          ## note: auto-remove newline AND trailing whitespace - why? why not?
-          comment = s.scan_until( /\n|$/ ).strip
-          ## print "comment:"
-          ## pp comment
-          t << [:comment, comment]
-       elsif s.scan( /[ \t]+/ )   ## one or more spaces
+       if s.scan( /[ \t]+/ )   ## one or more spaces
           ## note: (auto-)convert tab to space - why? why not?
           t << [:sp, s.matched.gsub( /[\t]/, ' ') ]
        elsif s.scan( /\r?\n/ )    ## check for (windows) carriage return (\r) - why? why not?
           t << [:nl, "\n" ]
-       elsif s.check( "'" )   ## single-quoted string
-          str = s.scan( SINGLE_QUOTE )
-          t << [:string, str]
-       elsif s.check( '"' )  ## double-quoted string
-          str = s.scan( DOUBLE_QUOTE )
-          t << [:string, str]
+       elsif s.scan( COMMENT )
+          t << [:comment, s.matched]
+       elsif s.scan( LINE_COMMENT )
+          t << [:comment, s.matched]
+       elsif s.scan( DOUBLE_QUOTE )  ## double-quoted string
+          t << [:string, s.matched]
+       elsif s.scan( SINGLE_QUOTE )  ## single-quoted string
+          t << [:string, s.matched]
        elsif s.scan( NAME )
           name = s.matched
           case name
diff --git a/solidity/lib/solidity/parser.rb b/solidity/lib/solidity/parser.rb
index b7925ba..7f2aa58 100644
--- a/solidity/lib/solidity/parser.rb
+++ b/solidity/lib/solidity/parser.rb
@@ -23,13 +23,11 @@ def _quick_pass_one
      lex = Lexer.new( @txt )
 
      until lex.eos?
-       while lex.peek == :sp do   ## note: do NOT skip newlines here; pass along blank/empty lines for now - why? why not?
-           lex.next
-       end
-
        case lex.peek
        when :comment  ## single or multi-line comment
            tree << [:comment, lex.next]
+           ## note:  if next token is newline - slurp / ignore
+           lex.next    if lex.peek == :nl
        when :pragma
             code = lex.scan_until( :';',
                                    include: true )
diff --git a/solidity/lib/solidity/version.rb b/solidity/lib/solidity/version.rb
index 84be610..fc30a5c 100644
--- a/solidity/lib/solidity/version.rb
+++ b/solidity/lib/solidity/version.rb
@@ -1,8 +1,8 @@
 
 module Solidity
   MAJOR = 0
-  MINOR = 1
-  PATCH = 5
+  MINOR = 2
+  PATCH = 0
   VERSION = [MAJOR,MINOR,PATCH].join('.')
 
   def self.version
diff --git a/solidity/sandbox/test_lexer_ruby.rb b/solidity/sandbox/test_lexer_ruby.rb
new file mode 100644
index 0000000..51b5464
--- /dev/null
+++ b/solidity/sandbox/test_lexer_ruby.rb
@@ -0,0 +1,90 @@
+###
+#  test ruby built-in lexers
+#   answer questions
+#    does end-of-line comment include newline in lexeme - yes/no?
+#
+#  -  [[6, 21], :on_comment, "## a comment here\n", END],
+
+
+
+require 'ripper'
+require 'pp'
+
+code = <<STR
+
+
+5.times    do    |    x    |
+	puts x
+  puts "hello"
+  puts 'hello'       ## a comment here
+  ## another comment here
+  ## another here
+
+  ## yet another here
+end
+
+
+STR
+
+
+puts code
+pp Ripper.lex(code)
+
+
+puts code
+## note: the keep_tokens option requires ruby 3.2+;
+##   older rubies raise ArgumentError (unknown keyword: :keep_tokens)
+pp RubyVM::AbstractSyntaxTree.parse( code,
+                                     keep_tokens: true).tokens
+
+# =>
+# [[0, :tIDENTIFIER, "puts", [1, 0, 1, 4]],
+#  [1, :"(", "(", [1, 4, 1, 5]],
+#  [2, :tSTRING_BEG, "'", [1, 5, 1, 6]],
+#  [3, :tSTRING_CONTENT, "test", [1, 6, 1, 10]],
+#  [4, :tSTRING_END, "'", [1, 10, 1, 11]],
+#  [5, :",", ",", [1, 11, 1, 12]],
+#  [6, :tSP, " ", [1, 12, 1, 13]],
+#  [7, :")", ")", [1, 13, 1, 14]]]
+
+
+__END__
+
+[[[1, 0], :on_ignored_nl, "\n", BEG],
+ [[2, 0], :on_ignored_nl, "\n", BEG],
+ [[3, 0], :on_int, "5", END],
+ [[3, 1], :on_period, ".", DOT],
+ [[3, 2], :on_ident, "times", ARG],
+ [[3, 7], :on_sp, "    ", ARG],
+ [[3, 11], :on_kw, "do", BEG],
+ [[3, 13], :on_sp, "    ", BEG],
+ [[3, 17], :on_op, "|", BEG|LABEL],
+ [[3, 18], :on_sp, "    ", BEG|LABEL],
+ [[3, 22], :on_ident, "x", ARG],
+ [[3, 23], :on_sp, "    ", ARG],
+ [[3, 27], :on_op, "|", BEG|LABEL],
+ [[3, 28], :on_ignored_nl, "\n", BEG|LABEL],
+ [[4, 0], :on_sp, "\t", BEG|LABEL],
+ [[4, 1], :on_ident, "puts", CMDARG],
+ [[4, 5], :on_sp, " ", CMDARG],
+ [[4, 6], :on_ident, "x", END|LABEL],
+ [[4, 7], :on_nl, "\n", BEG],
+ [[5, 0], :on_sp, "  ", BEG],
+ [[5, 2], :on_ident, "puts", CMDARG],
+ [[5, 6], :on_sp, " ", CMDARG],
+ [[5, 7], :on_tstring_beg, "\"", CMDARG],
+ [[5, 8], :on_tstring_content, "hello", CMDARG],
+ [[5, 13], :on_tstring_end, "\"", END],
+ [[5, 14], :on_nl, "\n", BEG],
+ [[6, 0], :on_sp, "  ", BEG],
+ [[6, 2], :on_ident, "puts", CMDARG],
+ [[6, 6], :on_sp, " ", CMDARG],
+ [[6, 7], :on_tstring_beg, "'", CMDARG],
+ [[6, 8], :on_tstring_content, "hello", CMDARG],
+ [[6, 13], :on_tstring_end, "'", END],
+ [[6, 14], :on_sp, "       ", END],
+ [[6, 21], :on_comment, "## a comment here\n", END],
+ [[7, 0], :on_kw, "end", END],
+ [[7, 3], :on_nl, "\n", BEG],
+ [[8, 0], :on_ignored_nl, "\n", BEG],
+ [[9, 0], :on_ignored_nl, "\n", BEG]]
\ No newline at end of file
diff --git a/solidity/test/test_lexer.rb b/solidity/test/test_lexer.rb
new file mode 100644
index 0000000..1b69fa8
--- /dev/null
+++ b/solidity/test/test_lexer.rb
@@ -0,0 +1,37 @@
+##
+#  to run use
+#     ruby -I ./lib -I ./test test/test_lexer.rb
+
+
+require 'helper'
+
+
+
+class TestLexer < Minitest::Test
+## join the token lexemes back into one string (for round-trip checks)
+def _untokenize( tokens )
+  buf = String.new('')
+  tokens.each do |t|
+    buf <<  (t.is_a?( String ) ? t : t[1])   ## bare string or [type, lexeme] pair
+
+    ## dump some token types
+    pp t    if [:comment, :string].include?( t[0] )
+  end
+  buf
+end
+
+## lex each sample contract; the joined lexemes must equal the source text
+def test_contracts
+    ['contract1',
+     'contract2',
+     'contract3'].each do |name|
+      path = "./contracts/#{name}.sol"
+      lexer = Solidity::Lexer.read( path )
+
+      tokens = lexer.tokenize
+
+      txt = read_text( path )
+      assert_equal txt, _untokenize( tokens )
+    end
+end
+end   ## class TestLexer