From 9ae150ac03b26b05e0db0bbf1b96b93095b10e57 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 12:15:10 -0300 Subject: [PATCH 01/20] =?UTF-8?q?=F0=9F=9A=A7=20Change=20automata=20state?= =?UTF-8?q?=20to=20be=20string=20instead=20of=20numeric?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change is to make all automata states more readable --- src/filesystem.py | 8 +-- src/main.py | 142 +++++++++++++++++++++++----------------------- 2 files changed, 75 insertions(+), 75 deletions(-) diff --git a/src/filesystem.py b/src/filesystem.py index 697f275..d9182c1 100644 --- a/src/filesystem.py +++ b/src/filesystem.py @@ -15,7 +15,7 @@ class Token(NamedTuple): value: str; class ResTokenList(NamedTuple): - lastState: int; + lastState: str; lastStartTokenIndex: int; tokenStartLine: int; tokenOverflow: str; @@ -31,11 +31,11 @@ def listDirFiles(dirPath: str) -> 'list[str]': onlyfiles = [f for f in listdir(dirPath) if isfile(join(dirPath, f))] return onlyfiles; -def readFileLines(opened_file: TextIOWrapper, on_line: Callable[[str, int, int, str], ResTokenList]): +def readFileLines(opened_file: TextIOWrapper, on_line: Callable[[str, int, str, str], ResTokenList]): # Lê cada linha do arquivo e passa para a função de callback on_line response: list[Token] = []; - currentState: ResTokenList = ResTokenList(0, 0, 0, '', []); + currentState: ResTokenList = ResTokenList('0', 0, 0, '', []); # Loop through each line via file handler for count, line in enumerate(opened_file): @@ -45,7 +45,7 @@ def readFileLines(opened_file: TextIOWrapper, on_line: Callable[[str, int, int, response = response + res.tokenList; currentState = res; - if (currentState.lastState == 8): + if (currentState.lastState == '8'): t = Token('CoMF', currentState.tokenStartLine, currentState.lastStartTokenIndex, 0, currentState.tokenOverflow); response.append(t); diff --git a/src/main.py b/src/main.py index 8565c32..b02df68 100644 --- a/src/main.py +++ 
b/src/main.py @@ -5,7 +5,7 @@ DIGITO = re.compile(r'/\d/g'); -def onReadLine(line: str, lineNumber: int, initialState: int, overflow: str) -> ResTokenList: +def onReadLine(line: str, lineNumber: int, initialState: str, overflow: str) -> ResTokenList: # Se a linha estiver vaxia não fax nada tokens = findTokensInString(line, lineNumber, initialState, overflow); return tokens; @@ -34,28 +34,28 @@ def hasNonASCII(s: str): return True; return False -def findTokensInString(line: str, lineCount: int, initialState: int, overflow: str) -> ResTokenList: +def findTokensInString(line: str, lineCount: int, initialState: str, overflow: str) -> ResTokenList: lineLength: int = len(line); tokenStartIndex: int = 0; currentIndex: int = 0; - currentState: int = initialState; + currentState: str = initialState; tokensFoundInThisLine: list[Token] = []; - tokenOverflow: str = overflow if initialState == 8 else ''; + tokenOverflow: str = overflow if initialState == '8' else ''; exitLoop = False; while (not exitLoop and currentIndex < lineLength): - if (currentState == 0): + if (currentState == '0'): if (line[currentIndex] == '/'): currentIndex = currentIndex + 1; - currentState = 2; + currentState = '2'; elif (isDelimiter(line[currentIndex])): t = Token('DEL', lineCount, currentIndex, currentIndex + 1, line[currentIndex:currentIndex + 1]); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; currentIndex = currentIndex + 1; elif (re.match( r'[a-zA-Z]+', line[currentIndex])): - currentState = 5; + currentState = '5'; tokenStartIndex = currentIndex; if(currentIndex + 1 >= lineLength): atEndOfLine = line[tokenStartIndex:] @@ -64,60 +64,60 @@ def findTokensInString(line: str, lineCount: int, initialState: int, overflow: s else: t = Token('IDE', lineCount, tokenStartIndex, currentIndex, atEndOfLine); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; currentIndex = currentIndex + 1; else: currentIndex = currentIndex + 1; elif (line[currentIndex] == 
'"'): tokenStartIndex = currentIndex; currentIndex = currentIndex + 1; - currentState = 6; + currentState = '6'; elif (line[currentIndex] == '&'): currentIndex = currentIndex + 1; - currentState = 14; + currentState = '14'; elif (line[currentIndex] == '|'): currentIndex = currentIndex + 1; - currentState = 16; + currentState = '16'; elif (line[currentIndex] == '!'): currentIndex = currentIndex + 1; - currentState = 12; + currentState = '12'; elif (line[currentIndex] == '=' or line[currentIndex] == '<' or line[currentIndex] == '>'): currentIndex = currentIndex + 1; - currentState = 18; + currentState = '18'; elif (line[currentIndex] == '+'): currentIndex = currentIndex + 1; - currentState = 19; + currentState = '19'; elif (line[currentIndex] == '*'): t = Token('ART', lineCount, currentIndex, currentIndex + 1, line[currentIndex:currentIndex + 1]); tokensFoundInThisLine.append(t); currentIndex = currentIndex + 1; - currentState = 0; + currentState = '0'; elif (line[currentIndex] == '-'): if (line[currentIndex + 1] == '-'): t = Token('ART', lineCount, currentIndex, currentIndex + 2, line[currentIndex:currentIndex + 2]); tokensFoundInThisLine.append(t); currentIndex = currentIndex + 2; - currentState = 0; + currentState = '0'; else: tokenStartIndex = currentIndex; currentIndex = currentIndex + 1; - currentState = 20; + currentState = '20'; elif (re.match(r'\d', line[currentIndex])): tokenStartIndex = currentIndex; currentIndex = currentIndex + 1; - currentState = 21; + currentState = '21'; elif(line[currentIndex] == ' ' or line[currentIndex] == '\t' or line[currentIndex] == '\n'): currentIndex = currentIndex + 1; - currentState = 0; + currentState = '0'; else: mlkmk = line[currentIndex]; t = Token('TMF', lineCount, currentIndex, currentIndex, mlkmk); tokensFoundInThisLine.append(t); currentIndex = currentIndex + 1; - currentState = 0; - elif(currentState == 2): + currentState = '0'; + elif(currentState == '2'): if (line[currentIndex] == '*'): - currentState = 8; + 
currentState = '8'; tokenStartIndex = currentIndex - 1; # considerando a barra anterior currentIndex = currentIndex + 1; elif (line[currentIndex] == '/'): @@ -125,13 +125,13 @@ def findTokensInString(line: str, lineCount: int, initialState: int, overflow: s #tokensFoundInThisLine.append(t); exitLoop = True; currentIndex = lineLength - 1; - currentState = 0; + currentState = '0'; else: t = Token('ART', lineCount, currentIndex -1, currentIndex, line[currentIndex - 1: currentIndex]); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; currentIndex = currentIndex + 1; - elif(currentState == 5): + elif(currentState == '5'): if(currentIndex + 1 >= lineLength): atEndOfLine = line[tokenStartIndex:] if (isReserved(line[tokenStartIndex: currentIndex])): @@ -139,7 +139,7 @@ def findTokensInString(line: str, lineCount: int, initialState: int, overflow: s else: t = Token('IDE', lineCount, tokenStartIndex, currentIndex, atEndOfLine); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; tokenStartIndex = 0; elif (line[currentIndex] == '_' or re.match(r'[a-zA-Z]+', line[currentIndex]) or re.match(r'\d', line[currentIndex])): currentIndex = currentIndex + 1; @@ -150,9 +150,9 @@ def findTokensInString(line: str, lineCount: int, initialState: int, overflow: s else: t = Token('IDE', lineCount, tokenStartIndex, currentIndex, ideToken); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; tokenStartIndex = 0; - elif(currentState == 6): + elif(currentState == '6'): if (line[currentIndex] == '"'): stoken = line[tokenStartIndex: currentIndex + 1]; if (hasNonASCII(stoken)): @@ -160,20 +160,20 @@ def findTokensInString(line: str, lineCount: int, initialState: int, overflow: s else: t = Token('CAC', lineCount, tokenStartIndex, currentIndex, stoken); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; tokenStartIndex = 0; currentIndex = currentIndex + 1; elif(line[currentIndex] == '\n'): t = Token('CMF', 
lineCount, tokenStartIndex, lineLength, line[tokenStartIndex: lineLength - 1]); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; tokenStartIndex = 0; currentIndex = currentIndex + 1; else: currentIndex = currentIndex + 1; - elif(currentState == 8): + elif(currentState == '8'): if (line[currentIndex] == '*'): - currentState = 10; + currentState = '10'; currentIndex = currentIndex + 1; elif (line[currentIndex] == '\n' or currentIndex == lineLength -1): l = line.replace('\n', ''); @@ -182,77 +182,77 @@ def findTokensInString(line: str, lineCount: int, initialState: int, overflow: s currentIndex = currentIndex + 1; else: currentIndex = currentIndex + 1; - elif(currentState == 10): + elif(currentState == '10'): if (line[currentIndex] == '/'): t = Token('COM', lineCount, tokenStartIndex, currentIndex, line[tokenStartIndex: currentIndex]); #tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; currentIndex = currentIndex + 1; else: - currentState = 8; + currentState = '8'; currentIndex = currentIndex + 1; - elif(currentState == 14): + elif(currentState == '14'): if (line[currentIndex] == '&'): t = Token('LOG', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; currentIndex = currentIndex + 1; else: t = Token('TMF', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex]); tokensFoundInThisLine.append(t); currentIndex = currentIndex + 1; - currentState = 0; - elif(currentState == 16): + currentState = '0'; + elif(currentState == '16'): if (line[currentIndex] == '|'): t = Token('LOG', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; currentIndex = currentIndex + 1; else: t = Token('TMF', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex]); 
tokensFoundInThisLine.append(t); currentIndex = currentIndex + 1; - currentState = 0; - elif(currentState == 12): + currentState = '0'; + elif(currentState == '12'): if (line[currentIndex] == '='): t = Token('REL', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; currentIndex = currentIndex + 1; else: t = Token('LOG', lineCount, currentIndex -1, currentIndex, line[currentIndex -1: currentIndex]); tokensFoundInThisLine.append(t); - currentState = 0; - elif(currentState == 18): + currentState = '0'; + elif(currentState == '18'): if (line[currentIndex] == '='): t = Token('REL', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; currentIndex = currentIndex + 1; else: t = Token('REL', lineCount, currentIndex -1, currentIndex, line[currentIndex -1: currentIndex]); tokensFoundInThisLine.append(t); - currentState = 0; - elif(currentState == 19): + currentState = '0'; + elif(currentState == '19'): if (line[currentIndex] == '+'): t = Token('ART', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; currentIndex = currentIndex + 1; else: t = Token('ART', lineCount, currentIndex -1, currentIndex, line[currentIndex -1: currentIndex]); tokensFoundInThisLine.append(t); - currentState = 0; - elif(currentState == 20): + currentState = '0'; + elif(currentState == '20'): if (re.match(r'\d', line[currentIndex])): - currentState = 21; + currentState = '21'; elif (line[currentIndex] == ' '): currentIndex = currentIndex + 1; else: t = Token('ART', lineCount, tokenStartIndex, currentIndex, line[tokenStartIndex: currentIndex]); tokensFoundInThisLine.append(t); - currentState = 0; - elif(currentState == 21): + currentState = '0'; + elif(currentState == '21'): 
if(currentIndex + 1 >= lineLength): l = line[tokenStartIndex:] if (l[len(l) - 1] == '\n'): @@ -264,54 +264,54 @@ def findTokensInString(line: str, lineCount: int, initialState: int, overflow: s elif (re.match(r'\d', line[currentIndex])): currentIndex = currentIndex + 1; elif (line[currentIndex] == '.'): - currentState = 22; + currentState = '22'; currentIndex = currentIndex + 1; else: t = Token('NRO', lineCount, tokenStartIndex, currentIndex, line[tokenStartIndex: currentIndex]); tokensFoundInThisLine.append(t); if (line[currentIndex] == ' ' or line[currentIndex] == '-' or line[currentIndex] == '\t'): - currentState = 24; + currentState = '24'; else: - currentState = 0; - elif(currentState == 22): + currentState = '0'; + elif(currentState == '22'): if (re.match(r'\d', line[currentIndex])): currentIndex = currentIndex + 1; - currentState = 23; + currentState = '23'; else: t = Token('NMF', lineCount, tokenStartIndex, currentIndex, line[tokenStartIndex: currentIndex]); tokensFoundInThisLine.append(t); - currentState = 0; - elif(currentState == 23): + currentState = '0'; + elif(currentState == '23'): if (re.match(r'\d', line[currentIndex])): currentIndex = currentIndex + 1; else: t = Token('NRO', lineCount, tokenStartIndex, currentIndex, line[tokenStartIndex: currentIndex]); tokensFoundInThisLine.append(t); - currentState = 0; - elif(currentState == 24): + currentState = '0'; + elif(currentState == '24'): if (line[currentIndex] == ' ' or line[currentIndex] == '\t'): currentIndex = currentIndex + 1; elif (line[currentIndex] == '-'): tokenStartIndex = currentIndex; currentIndex = currentIndex + 1; - currentState = 25; + currentState = '25'; else: - currentState = 0; - elif(currentState == 25): + currentState = '0'; + elif(currentState == '25'): if (line[currentIndex] == ' ' or line[currentIndex] == '\t'): currentIndex = currentIndex + 1; elif (re.match(r'\d', line[currentIndex])): t = Token('ART', lineCount, tokenStartIndex, tokenStartIndex + 1, line[tokenStartIndex: 
tokenStartIndex + 1]); tokensFoundInThisLine.append(t); - currentState = 0; + currentState = '0'; else: currentIndex = tokenStartIndex; - currentState = 0; + currentState = '0'; else: exitLoop = True; - if (currentState != 8): - currentState = 0; + if (currentState != '8'): + currentState = '0'; return ResTokenList(currentState, tokenStartIndex, lineCount, tokenOverflow, tokensFoundInThisLine); From e48978626569ed2ec3cdbb6540d7ddb03f1a4fd0 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 14:38:30 -0300 Subject: [PATCH 02/20] =?UTF-8?q?=F0=9F=9A=A7=20Separating=20the=20initial?= =?UTF-8?q?=20automata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 112 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 89 insertions(+), 23 deletions(-) diff --git a/src/main.py b/src/main.py index b02df68..657c6aa 100644 --- a/src/main.py +++ b/src/main.py @@ -1,4 +1,5 @@ import re +from typing import Callable from filesystem import ResTokenList, Token, TokenListPerFile, listDirFiles, readFileLines LETRA = re.compile(r'/[a-zA-Z]+/g'); @@ -34,6 +35,67 @@ def hasNonASCII(s: str): return True; return False +Automata = Callable[[str, str], str]; + +def IdendifierAutomata(state: str, input: str): + isValid = re.match(r'[a-zA-Z]+', input) or re.match(r'\d', input); + if (state == 'IdentifierFinal'): + return 'IdentifierFinal'; + if (state == 'Identifier' and isValid): + return 'Identifier' + elif (state == 'Identifier' and not isValid): + return 'IdentifierFinal' + return state + 'Error:_' + input; + +def DelimiterAutomata(state: str, input: str): + if (state == 'DelimiterFinal'): + return 'DelimiterFinal'; + if (state == 'Delimiter'): + return 'DelimiterFinal'; + return state + 'Error:_' + input; + +def ErrorAutomata(state: str, input: str): + return state + 'Error:_' + input; + +def getNextState(state: str, input: str) -> str: + if (not state == 'InitialState'): + automata: Automata = 
findApropriateAutomata(state); + return automata(state, input); + if (input == '/'): + return 'PossibleComment'; + elif (isDelimiter(input)): + return 'Delimiter' + elif (re.match( r'[a-zA-Z]+', input)): # Se for uma letra + return 'Identifier' + return '0'; + +def isFinalState(state: str): + finalStates = {'DelimiterFinal'}; + if state in finalStates: + return True; + return False; + +def getTokenType(state: str): + stateToTokenType = { + 'Delimiter': 'DEL', + } + return stateToTokenType.get(state, 'None'); + +def findApropriateAutomata(state: str) -> Automata: + if (state == 'Identifier'): + return IdendifierAutomata; + elif (state == 'Delimiter'): + return DelimiterAutomata; + return ErrorAutomata; + +def generateToken(state: str, lineNumber: int, lineText: str, tokenStartIndex: int, tokenEndIndex: int): + tokenType = getTokenType(state); + tokenText = lineText[tokenStartIndex:tokenEndIndex]; + if (tokenType == 'IDE'): + tokenType = 'PRE' if isReserved(tokenText) else 'IDE'; + return Token(tokenType, lineNumber, tokenStartIndex, tokenEndIndex, tokenText); + + def findTokensInString(line: str, lineCount: int, initialState: str, overflow: str) -> ResTokenList: lineLength: int = len(line); tokenStartIndex: int = 0; @@ -45,30 +107,34 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s exitLoop = False; while (not exitLoop and currentIndex < lineLength): + # Se ainda no estado inicial, considere o caractere atual como inicio do token + if (currentState == 'InitialState'): + tokenStartIndex = currentIndex; + + # Caractere atual + character: str = line[currentIndex]; + + # Proximo estado, dado o caractere lido + nextState: str = getNextState(currentState, character); + + # Se for um estado final, gere um token + if (isFinalState(nextState)): + token = generateToken(currentState, lineCount, line, tokenStartIndex, currentIndex); + tokensFoundInThisLine.append(token); # Apos salvar o token + currentState = 'InitialState'; # Volte para o 
estado inicial + + # Do contrario, leia o proximo caractere + else: + # Se a linha termina e o estado não é final, decrementa o index + # para chegar num estado final na proxima iteração + if (currentIndex + 1 >= lineLength): + currentIndex = currentIndex - 1; + + currentState = nextState # Define o priximo estado + currentIndex = currentIndex + 1; + if (currentState == '0'): - if (line[currentIndex] == '/'): - currentIndex = currentIndex + 1; - currentState = '2'; - elif (isDelimiter(line[currentIndex])): - t = Token('DEL', lineCount, currentIndex, currentIndex + 1, line[currentIndex:currentIndex + 1]); - tokensFoundInThisLine.append(t); - currentState = '0'; - currentIndex = currentIndex + 1; - elif (re.match( r'[a-zA-Z]+', line[currentIndex])): - currentState = '5'; - tokenStartIndex = currentIndex; - if(currentIndex + 1 >= lineLength): - atEndOfLine = line[tokenStartIndex:] - if (isReserved(line[tokenStartIndex: currentIndex])): - t = Token('PRE', lineCount, tokenStartIndex, currentIndex, atEndOfLine); - else: - t = Token('IDE', lineCount, tokenStartIndex, currentIndex, atEndOfLine); - tokensFoundInThisLine.append(t); - currentState = '0'; - currentIndex = currentIndex + 1; - else: - currentIndex = currentIndex + 1; - elif (line[currentIndex] == '"'): + if (line[currentIndex] == '"'): tokenStartIndex = currentIndex; currentIndex = currentIndex + 1; currentState = '6'; From 8bc220b5c7f68e5c2b7a9620185bba07f612e2b5 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 14:58:41 -0300 Subject: [PATCH 03/20] =?UTF-8?q?=F0=9F=9A=A7=E2=9C=A8=20Add=20string=20au?= =?UTF-8?q?tomata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 63 ++++++++++++++--------------------------------------- 1 file changed, 16 insertions(+), 47 deletions(-) diff --git a/src/main.py b/src/main.py index 657c6aa..291d844 100644 --- a/src/main.py +++ b/src/main.py @@ -57,6 +57,13 @@ def DelimiterAutomata(state: str, 
input: str): def ErrorAutomata(state: str, input: str): return state + 'Error:_' + input; +def StringAutomata(state: str, input: str): + if (state == 'String' and input == '"'): + return 'StringFinal'; + if (state == 'String' and input == '\n'): + return 'MalformedString'; + return 'String'; + def getNextState(state: str, input: str) -> str: if (not state == 'InitialState'): automata: Automata = findApropriateAutomata(state); @@ -67,17 +74,21 @@ def getNextState(state: str, input: str) -> str: return 'Delimiter' elif (re.match( r'[a-zA-Z]+', input)): # Se for uma letra return 'Identifier' + elif (input == '"'): + return 'String'; return '0'; def isFinalState(state: str): - finalStates = {'DelimiterFinal'}; + finalStates = {'DelimiterFinal', 'MalformedString', 'StringFinal'}; if state in finalStates: return True; return False; def getTokenType(state: str): stateToTokenType = { - 'Delimiter': 'DEL', + 'DelimiterFinal': 'DEL', + 'MalformedString': 'CMF', + 'StringFinal': 'CAC', } return stateToTokenType.get(state, 'None'); @@ -93,6 +104,8 @@ def generateToken(state: str, lineNumber: int, lineText: str, tokenStartIndex: i tokenText = lineText[tokenStartIndex:tokenEndIndex]; if (tokenType == 'IDE'): tokenType = 'PRE' if isReserved(tokenText) else 'IDE'; + if (tokenType == 'CAC'): + tokenType = 'CMF' if hasNonASCII(tokenText) else 'CAC'; return Token(tokenType, lineNumber, tokenStartIndex, tokenEndIndex, tokenText); @@ -134,11 +147,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex = currentIndex + 1; if (currentState == '0'): - if (line[currentIndex] == '"'): - tokenStartIndex = currentIndex; - currentIndex = currentIndex + 1; - currentState = '6'; - elif (line[currentIndex] == '&'): + if (line[currentIndex] == '&'): currentIndex = currentIndex + 1; currentState = '14'; elif (line[currentIndex] == '|'): @@ -197,46 +206,6 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s 
tokensFoundInThisLine.append(t); currentState = '0'; currentIndex = currentIndex + 1; - elif(currentState == '5'): - if(currentIndex + 1 >= lineLength): - atEndOfLine = line[tokenStartIndex:] - if (isReserved(line[tokenStartIndex: currentIndex])): - t = Token('PRE', lineCount, tokenStartIndex, currentIndex, atEndOfLine); - else: - t = Token('IDE', lineCount, tokenStartIndex, currentIndex, atEndOfLine); - tokensFoundInThisLine.append(t); - currentState = '0'; - tokenStartIndex = 0; - elif (line[currentIndex] == '_' or re.match(r'[a-zA-Z]+', line[currentIndex]) or re.match(r'\d', line[currentIndex])): - currentIndex = currentIndex + 1; - else: - ideToken = line[tokenStartIndex: currentIndex] - if (isReserved(line[tokenStartIndex: currentIndex])): - t = Token('PRE', lineCount, tokenStartIndex, currentIndex, ideToken); - else: - t = Token('IDE', lineCount, tokenStartIndex, currentIndex, ideToken); - tokensFoundInThisLine.append(t); - currentState = '0'; - tokenStartIndex = 0; - elif(currentState == '6'): - if (line[currentIndex] == '"'): - stoken = line[tokenStartIndex: currentIndex + 1]; - if (hasNonASCII(stoken)): - t = Token('CMF', lineCount, tokenStartIndex, currentIndex, stoken); - else: - t = Token('CAC', lineCount, tokenStartIndex, currentIndex, stoken); - tokensFoundInThisLine.append(t); - currentState = '0'; - tokenStartIndex = 0; - currentIndex = currentIndex + 1; - elif(line[currentIndex] == '\n'): - t = Token('CMF', lineCount, tokenStartIndex, lineLength, line[tokenStartIndex: lineLength - 1]); - tokensFoundInThisLine.append(t); - currentState = '0'; - tokenStartIndex = 0; - currentIndex = currentIndex + 1; - else: - currentIndex = currentIndex + 1; elif(currentState == '8'): if (line[currentIndex] == '*'): currentState = '10'; From 2e77568f96bc1cd3638a0434b40750a18552473d Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 15:46:37 -0300 Subject: [PATCH 04/20] =?UTF-8?q?=F0=9F=9A=A7=E2=9C=A8=20Add=20comment=20a?= =?UTF-8?q?utomata?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 58 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/src/main.py b/src/main.py index 291d844..3f50be0 100644 --- a/src/main.py +++ b/src/main.py @@ -64,6 +64,23 @@ def StringAutomata(state: str, input: str): return 'MalformedString'; return 'String'; +def CommentAutomata(state: str, input: str): + if (state == 'PossibleComment' and not (input == '*' or input == '/')): + return 'ArithmeticFinal' + if (state == 'PossibleComment' and input == '*'): + return 'BlockComment' + if (state == 'PossibleComment' and input == '/'): + return 'LineComment' + if (state == 'LineComment' and input == '\n'): + return 'LineCommentFinal' + if (state == 'BlockComment' and input == '\n'): + return 'BlockCommentOverflow' + if (state == 'LineComment'): + return 'LineComment' + if (state == 'BlockComment'): + return 'BlockComment' + return state + 'Error:_' + input; + def getNextState(state: str, input: str) -> str: if (not state == 'InitialState'): automata: Automata = findApropriateAutomata(state); @@ -79,7 +96,14 @@ def getNextState(state: str, input: str) -> str: return '0'; def isFinalState(state: str): - finalStates = {'DelimiterFinal', 'MalformedString', 'StringFinal'}; + finalStates = { + 'DelimiterFinal', + 'MalformedString', + 'StringFinal', + 'LineCommentFinal', + 'BlockCommentFinal', + 'ArithmeticFinal', + }; if state in finalStates: return True; return False; @@ -89,14 +113,24 @@ def getTokenType(state: str): 'DelimiterFinal': 'DEL', 'MalformedString': 'CMF', 'StringFinal': 'CAC', + 'LineCommentFinal': 'COM', + 'BlockCommentFinal': 'CMB', + 'ArithmeticFinal': 'ART' } return stateToTokenType.get(state, 'None'); +def toFinalState(state: str): + return state + 'Final'; + def findApropriateAutomata(state: str) -> Automata: - if (state == 'Identifier'): + if ('Identifier' in state): return IdendifierAutomata; 
- elif (state == 'Delimiter'): + elif ('Delimiter' in state): return DelimiterAutomata; + elif ('String' in state): + return StringAutomata; + elif ('Comment' in state): + return CommentAutomata; return ErrorAutomata; def generateToken(state: str, lineNumber: int, lineText: str, tokenStartIndex: int, tokenEndIndex: int): @@ -141,6 +175,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s # Se a linha termina e o estado não é final, decrementa o index # para chegar num estado final na proxima iteração if (currentIndex + 1 >= lineLength): + nextState = toFinalState(nextState); # Define o estado como final currentIndex = currentIndex - 1; currentState = nextState # Define o priximo estado @@ -190,22 +225,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s tokensFoundInThisLine.append(t); currentIndex = currentIndex + 1; currentState = '0'; - elif(currentState == '2'): - if (line[currentIndex] == '*'): - currentState = '8'; - tokenStartIndex = currentIndex - 1; # considerando a barra anterior - currentIndex = currentIndex + 1; - elif (line[currentIndex] == '/'): - t = Token('COM', lineCount, currentIndex, lineLength - 1, line[currentIndex - 1: -1]); - #tokensFoundInThisLine.append(t); - exitLoop = True; - currentIndex = lineLength - 1; - currentState = '0'; - else: - t = Token('ART', lineCount, currentIndex -1, currentIndex, line[currentIndex - 1: currentIndex]); - tokensFoundInThisLine.append(t); - currentState = '0'; - currentIndex = currentIndex + 1; + elif(currentState == '8'): if (line[currentIndex] == '*'): currentState = '10'; From ada568a4f71d25e23a33146e4323549383c84813 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 16:21:02 -0300 Subject: [PATCH 05/20] =?UTF-8?q?=F0=9F=9A=A7=E2=9C=A8=20Add=20multiline?= =?UTF-8?q?=20block=20comment=20automata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 31 
+++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/src/main.py b/src/main.py index 3f50be0..e8e84bd 100644 --- a/src/main.py +++ b/src/main.py @@ -77,6 +77,12 @@ def CommentAutomata(state: str, input: str): return 'BlockCommentOverflow' if (state == 'LineComment'): return 'LineComment' + if (state == 'BlockComment' and input == '*'): + return 'ClosingBlockComment' + if (state == 'ClosingBlockComment' and input == '/'): + return 'BlockCommentFinal' + if (state == 'ClosingBlockComment' and not input == '/'): + return 'BlockComment' if (state == 'BlockComment'): return 'BlockComment' return state + 'Error:_' + input; @@ -178,6 +184,11 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s nextState = toFinalState(nextState); # Define o estado como final currentIndex = currentIndex - 1; + # Se há um comentario de bloco multilinha + if (currentState == 'BlockCommentOverflow'): + tokenOverflow = line[tokenStartIndex:].replace('\n', ''); + exitLoop = True; + currentState = nextState # Define o priximo estado currentIndex = currentIndex + 1; @@ -226,26 +237,6 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex = currentIndex + 1; currentState = '0'; - elif(currentState == '8'): - if (line[currentIndex] == '*'): - currentState = '10'; - currentIndex = currentIndex + 1; - elif (line[currentIndex] == '\n' or currentIndex == lineLength -1): - l = line.replace('\n', ''); - if (line != '\n'): - tokenOverflow = tokenOverflow + l[tokenStartIndex: len(l)]; - currentIndex = currentIndex + 1; - else: - currentIndex = currentIndex + 1; - elif(currentState == '10'): - if (line[currentIndex] == '/'): - t = Token('COM', lineCount, tokenStartIndex, currentIndex, line[tokenStartIndex: currentIndex]); - #tokensFoundInThisLine.append(t); - currentState = '0'; - currentIndex = currentIndex + 1; - else: - currentState = '8'; - currentIndex = currentIndex + 1; elif(currentState 
== '14'): if (line[currentIndex] == '&'): t = Token('LOG', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); From 48ae0f1aab2e626a9211fb04b0acb6244211e6e6 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 17:04:35 -0300 Subject: [PATCH 06/20] =?UTF-8?q?=F0=9F=9A=A7=E2=9C=A8=20Add=20Logical=20o?= =?UTF-8?q?perators=20automata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 49 +++++++++++++++++++------------------------------ 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/src/main.py b/src/main.py index e8e84bd..b5877d3 100644 --- a/src/main.py +++ b/src/main.py @@ -87,6 +87,13 @@ def CommentAutomata(state: str, input: str): return 'BlockComment' return state + 'Error:_' + input; +def LogicalOperatorAutomata(state: str, input: str): + if (state == 'PossibleLogical&&' and input == '&'): + return 'LogicalOperatorFinal' + if (state == 'PossibleLogical||' and input == '|'): + return 'LogicalOperatorFinal' + return 'MalformedToken'; + def getNextState(state: str, input: str) -> str: if (not state == 'InitialState'): automata: Automata = findApropriateAutomata(state); @@ -99,6 +106,10 @@ def getNextState(state: str, input: str) -> str: return 'Identifier' elif (input == '"'): return 'String'; + elif (input == '&'): + return 'PossibleLogical&&'; + elif (input == '|'): + return 'PossibleLogical||'; return '0'; def isFinalState(state: str): @@ -109,6 +120,8 @@ def isFinalState(state: str): 'LineCommentFinal', 'BlockCommentFinal', 'ArithmeticFinal', + 'LogicalOperatorFinal', + 'MalformedToken', }; if state in finalStates: return True; @@ -121,7 +134,9 @@ def getTokenType(state: str): 'StringFinal': 'CAC', 'LineCommentFinal': 'COM', 'BlockCommentFinal': 'CMB', - 'ArithmeticFinal': 'ART' + 'ArithmeticFinal': 'ART', + 'LogicalOperatorFinal': 'LOG', + 'MalformedToken': 'TMF', } return stateToTokenType.get(state, 'None'); @@ -137,6 +152,8 @@ def 
findApropriateAutomata(state: str) -> Automata: return StringAutomata; elif ('Comment' in state): return CommentAutomata; + elif ('Logical' in state): + return LogicalOperatorAutomata; return ErrorAutomata; def generateToken(state: str, lineNumber: int, lineText: str, tokenStartIndex: int, tokenEndIndex: int): @@ -193,13 +210,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex = currentIndex + 1; if (currentState == '0'): - if (line[currentIndex] == '&'): - currentIndex = currentIndex + 1; - currentState = '14'; - elif (line[currentIndex] == '|'): - currentIndex = currentIndex + 1; - currentState = '16'; - elif (line[currentIndex] == '!'): + if (line[currentIndex] == '!'): currentIndex = currentIndex + 1; currentState = '12'; elif (line[currentIndex] == '=' or line[currentIndex] == '<' or line[currentIndex] == '>'): @@ -237,28 +248,6 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex = currentIndex + 1; currentState = '0'; - elif(currentState == '14'): - if (line[currentIndex] == '&'): - t = Token('LOG', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); - tokensFoundInThisLine.append(t); - currentState = '0'; - currentIndex = currentIndex + 1; - else: - t = Token('TMF', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex]); - tokensFoundInThisLine.append(t); - currentIndex = currentIndex + 1; - currentState = '0'; - elif(currentState == '16'): - if (line[currentIndex] == '|'): - t = Token('LOG', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); - tokensFoundInThisLine.append(t); - currentState = '0'; - currentIndex = currentIndex + 1; - else: - t = Token('TMF', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex]); - tokensFoundInThisLine.append(t); - currentIndex = currentIndex + 1; - currentState = '0'; elif(currentState == '12'): if 
(line[currentIndex] == '='): t = Token('REL', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); From a9899da970f79e727f245a368297843242b8fd72 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 17:16:03 -0300 Subject: [PATCH 07/20] =?UTF-8?q?=F0=9F=9A=A7=E2=9C=A8=20Add=20Logical=20n?= =?UTF-8?q?egation=20operator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/main.py b/src/main.py index b5877d3..dea8743 100644 --- a/src/main.py +++ b/src/main.py @@ -92,6 +92,8 @@ def LogicalOperatorAutomata(state: str, input: str): return 'LogicalOperatorFinal' if (state == 'PossibleLogical||' and input == '|'): return 'LogicalOperatorFinal' + if (state == 'PossibleLogical!' and not input == '='): + return 'LogicalOperatorFinal' return 'MalformedToken'; def getNextState(state: str, input: str) -> str: @@ -110,6 +112,8 @@ def getNextState(state: str, input: str) -> str: return 'PossibleLogical&&'; elif (input == '|'): return 'PossibleLogical||'; + elif (input == '!'): + return 'PossibleLogical!'; return '0'; def isFinalState(state: str): @@ -210,10 +214,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex = currentIndex + 1; if (currentState == '0'): - if (line[currentIndex] == '!'): - currentIndex = currentIndex + 1; - currentState = '12'; - elif (line[currentIndex] == '=' or line[currentIndex] == '<' or line[currentIndex] == '>'): + if (line[currentIndex] == '=' or line[currentIndex] == '<' or line[currentIndex] == '>'): currentIndex = currentIndex + 1; currentState = '18'; elif (line[currentIndex] == '+'): @@ -254,10 +255,6 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s tokensFoundInThisLine.append(t); currentState = '0'; currentIndex = currentIndex + 1; - else: - t = Token('LOG', lineCount, 
currentIndex -1, currentIndex, line[currentIndex -1: currentIndex]); - tokensFoundInThisLine.append(t); - currentState = '0'; elif(currentState == '18'): if (line[currentIndex] == '='): t = Token('REL', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); From 206bfd0a03d94a9f88b763ebce837c1e46dd76d6 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 17:46:28 -0300 Subject: [PATCH 08/20] =?UTF-8?q?=F0=9F=9A=A7=E2=9C=A8=20Add=20Relational?= =?UTF-8?q?=20operators=20automata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/main.py b/src/main.py index dea8743..39bd213 100644 --- a/src/main.py +++ b/src/main.py @@ -94,8 +94,17 @@ def LogicalOperatorAutomata(state: str, input: str): return 'LogicalOperatorFinal' if (state == 'PossibleLogical!' and not input == '='): return 'LogicalOperatorFinal' + if (state == 'PossibleLogical!' 
and input == '='): + return 'RelationalOperatorFinal' return 'MalformedToken'; +def RelationalOperatorAutomata(state: str, input: str): + if (state == 'Relational' and input == '='): + return 'DoubleRelationalOperator' + if (state == 'DoubleRelationalOperator'): + return 'RelationalOperatorFinal' + return 'RelationalOperatorFinal'; + def getNextState(state: str, input: str) -> str: if (not state == 'InitialState'): automata: Automata = findApropriateAutomata(state); @@ -114,6 +123,8 @@ def getNextState(state: str, input: str) -> str: return 'PossibleLogical||'; elif (input == '!'): return 'PossibleLogical!'; + elif (input == '=' or input == '<' or input == '>'): + return 'Relational'; return '0'; def isFinalState(state: str): @@ -141,6 +152,7 @@ def getTokenType(state: str): 'ArithmeticFinal': 'ART', 'LogicalOperatorFinal': 'LOG', 'MalformedToken': 'TMF', + 'RelationalOperatorFinal': 'REL', } return stateToTokenType.get(state, 'None'); @@ -158,6 +170,8 @@ def findApropriateAutomata(state: str) -> Automata: return CommentAutomata; elif ('Logical' in state): return LogicalOperatorAutomata; + elif ('Relational' in state): + return RelationalOperatorAutomata; return ErrorAutomata; def generateToken(state: str, lineNumber: int, lineText: str, tokenStartIndex: int, tokenEndIndex: int): @@ -214,10 +228,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex = currentIndex + 1; if (currentState == '0'): - if (line[currentIndex] == '=' or line[currentIndex] == '<' or line[currentIndex] == '>'): - currentIndex = currentIndex + 1; - currentState = '18'; - elif (line[currentIndex] == '+'): + if (line[currentIndex] == '+'): currentIndex = currentIndex + 1; currentState = '19'; elif (line[currentIndex] == '*'): @@ -249,22 +260,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex = currentIndex + 1; currentState = '0'; - elif(currentState == '12'): - if (line[currentIndex] == '='): - t = 
Token('REL', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); - tokensFoundInThisLine.append(t); - currentState = '0'; - currentIndex = currentIndex + 1; - elif(currentState == '18'): - if (line[currentIndex] == '='): - t = Token('REL', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); - tokensFoundInThisLine.append(t); - currentState = '0'; - currentIndex = currentIndex + 1; - else: - t = Token('REL', lineCount, currentIndex -1, currentIndex, line[currentIndex -1: currentIndex]); - tokensFoundInThisLine.append(t); - currentState = '0'; + elif(currentState == '19'): if (line[currentIndex] == '+'): t = Token('ART', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); From 6c8380000ef3da89550a9e2ca5a7afe9c8bedbaa Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 18:13:02 -0300 Subject: [PATCH 09/20] =?UTF-8?q?=F0=9F=9A=A7=E2=9C=A8=20Add=20Arithmetic?= =?UTF-8?q?=20operators=20automata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/main.py b/src/main.py index 39bd213..edbe13b 100644 --- a/src/main.py +++ b/src/main.py @@ -105,6 +105,15 @@ def RelationalOperatorAutomata(state: str, input: str): return 'RelationalOperatorFinal' return 'RelationalOperatorFinal'; +def ArithmeticOperatorAutomata(state: str, input: str): + if (state == 'Arithmetic' and input == '+'): + return 'DoubleArithmeticOperator' + if (state == 'DoubleArithmeticOperator'): + return 'ArithmeticOperatorFinal' + if (state == 'Arithmetic*'): + return 'ArithmeticOperatorFinal' + return 'ArithmeticOperatorFinal'; + def getNextState(state: str, input: str) -> str: if (not state == 'InitialState'): automata: Automata = findApropriateAutomata(state); @@ -125,6 +134,10 @@ def getNextState(state: str, input: str) 
-> str: return 'PossibleLogical!'; elif (input == '=' or input == '<' or input == '>'): return 'Relational'; + elif (input == '+'): + return 'Arithmetic'; + elif (input == '*'): + return 'Arithmetic*'; return '0'; def isFinalState(state: str): @@ -172,6 +185,8 @@ def findApropriateAutomata(state: str) -> Automata: return LogicalOperatorAutomata; elif ('Relational' in state): return RelationalOperatorAutomata; + elif ('Arithmetic' in state): + return ArithmeticOperatorAutomata; return ErrorAutomata; def generateToken(state: str, lineNumber: int, lineText: str, tokenStartIndex: int, tokenEndIndex: int): @@ -228,15 +243,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex = currentIndex + 1; if (currentState == '0'): - if (line[currentIndex] == '+'): - currentIndex = currentIndex + 1; - currentState = '19'; - elif (line[currentIndex] == '*'): - t = Token('ART', lineCount, currentIndex, currentIndex + 1, line[currentIndex:currentIndex + 1]); - tokensFoundInThisLine.append(t); - currentIndex = currentIndex + 1; - currentState = '0'; - elif (line[currentIndex] == '-'): + if (line[currentIndex] == '-'): if (line[currentIndex + 1] == '-'): t = Token('ART', lineCount, currentIndex, currentIndex + 2, line[currentIndex:currentIndex + 2]); tokensFoundInThisLine.append(t); @@ -260,17 +267,6 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex = currentIndex + 1; currentState = '0'; - - elif(currentState == '19'): - if (line[currentIndex] == '+'): - t = Token('ART', lineCount, currentIndex -1, currentIndex + 1, line[currentIndex -1: currentIndex + 1]); - tokensFoundInThisLine.append(t); - currentState = '0'; - currentIndex = currentIndex + 1; - else: - t = Token('ART', lineCount, currentIndex -1, currentIndex, line[currentIndex -1: currentIndex]); - tokensFoundInThisLine.append(t); - currentState = '0'; elif(currentState == '20'): if (re.match(r'\d', line[currentIndex])): currentState = 
'21'; From 3cab81da6b69a4335819c9672cf9aecf16a884cd Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 18:41:19 -0300 Subject: [PATCH 10/20] =?UTF-8?q?=F0=9F=9A=A7=E2=9C=A8=20Update=20Arithmet?= =?UTF-8?q?ic=20operator=20automata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/src/main.py b/src/main.py index edbe13b..45f1a77 100644 --- a/src/main.py +++ b/src/main.py @@ -112,6 +112,20 @@ def ArithmeticOperatorAutomata(state: str, input: str): return 'ArithmeticOperatorFinal' if (state == 'Arithmetic*'): return 'ArithmeticOperatorFinal' + if (state == 'Arithmetic-' and input == '-'): + return 'DoubleArithmeticOperator' + if (state == 'DoubleArithmeticOperator'): + return 'ArithmeticOperatorFinal' + if (state == 'Arithmetic-' and input == ' '): + return 'ArithmeticPossibleNROorART' + if (state == 'Arithmetic-' and re.match(r'\d', input)): + return 'NegativeNumber' + if (state == 'ArithmeticPossibleNROorART' and re.match(r'\d', input)): + return 'NegativeNumber' + if (state == 'ArithmeticPossibleNROorART' and input == ' '): + return 'ArithmeticPossibleNROorART' + if (state == 'ArithmeticPossibleNROorART'): + return 'ArithmeticOperatorFinal' return 'ArithmeticOperatorFinal'; def getNextState(state: str, input: str) -> str: @@ -138,6 +152,8 @@ def getNextState(state: str, input: str) -> str: return 'Arithmetic'; elif (input == '*'): return 'Arithmetic*'; + elif (input == '-'): + return 'Arithmetic-'; return '0'; def isFinalState(state: str): @@ -243,17 +259,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex = currentIndex + 1; if (currentState == '0'): - if (line[currentIndex] == '-'): - if (line[currentIndex + 1] == '-'): - t = Token('ART', lineCount, currentIndex, currentIndex + 2, line[currentIndex:currentIndex + 2]); - 
tokensFoundInThisLine.append(t); - currentIndex = currentIndex + 2; - currentState = '0'; - else: - tokenStartIndex = currentIndex; - currentIndex = currentIndex + 1; - currentState = '20'; - elif (re.match(r'\d', line[currentIndex])): + if (re.match(r'\d', line[currentIndex])): tokenStartIndex = currentIndex; currentIndex = currentIndex + 1; currentState = '21'; @@ -267,15 +273,6 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex = currentIndex + 1; currentState = '0'; - elif(currentState == '20'): - if (re.match(r'\d', line[currentIndex])): - currentState = '21'; - elif (line[currentIndex] == ' '): - currentIndex = currentIndex + 1; - else: - t = Token('ART', lineCount, tokenStartIndex, currentIndex, line[tokenStartIndex: currentIndex]); - tokensFoundInThisLine.append(t); - currentState = '0'; elif(currentState == '21'): if(currentIndex + 1 >= lineLength): l = line[tokenStartIndex:] From 96ca237d87eb318609c1b7ab3164182a4692ad61 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 21:01:50 -0300 Subject: [PATCH 11/20] =?UTF-8?q?=F0=9F=9A=A7=E2=9C=A8=20Add=20Numeric=20a?= =?UTF-8?q?utomata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 131 ++++++++++++++++++++++------------------------------ 1 file changed, 55 insertions(+), 76 deletions(-) diff --git a/src/main.py b/src/main.py index 45f1a77..d859c9f 100644 --- a/src/main.py +++ b/src/main.py @@ -119,15 +119,47 @@ def ArithmeticOperatorAutomata(state: str, input: str): if (state == 'Arithmetic-' and input == ' '): return 'ArithmeticPossibleNROorART' if (state == 'Arithmetic-' and re.match(r'\d', input)): - return 'NegativeNumber' + return 'Number' if (state == 'ArithmeticPossibleNROorART' and re.match(r'\d', input)): - return 'NegativeNumber' + return 'Number' if (state == 'ArithmeticPossibleNROorART' and input == ' '): return 'ArithmeticPossibleNROorART' if (state == 
'ArithmeticPossibleNROorART'): return 'ArithmeticOperatorFinal' + if (state == 'PossibleArithmeticMinus' and input == ' ' or input == '\t'): + return 'PossibleArithmeticMinus' + if (state == 'PossibleArithmeticMinus' and re.match(r'\d', input)): + return 'ArithmeticOperatorFinal' + if (state == 'PossibleArithmeticMinus'): + return 'GoBack' return 'ArithmeticOperatorFinal'; +def NumbertAutomata(state: str, input: str): + if (state == 'Number' and re.match(r'\d', input)): + return 'Number' + if (state == 'Number' and input == '.'): + return 'FPNumber' + if (state == 'FPNumber' and re.match(r'\d', input)): + return 'FPNumberComplete' + if (state == 'FPNumberComplete' and re.match(r'\d', input)): + return 'FPNumberComplete' + if (state == 'FPNumberComplete' and not re.match(r'\d', input)): + return 'NumberFinal' + if (state == 'FPNumber' and not re.match(r'\d', input)): + return 'MalformedNumber' + if (state == 'Number' and not (input == ' ' or input == '-' or input == '\t')): + return 'NumberFinal' + if (state == 'Number' and input == ' ' or input == '-' or input == '\t'): + return 'NumberFinalInPossibleOperation' + if (state == 'NumberFinalInPossibleOperation' and (input == ' ' or input == '\t')): + return 'NumberFinalInPossibleOperation' + if (state == 'NumberFinalInPossibleOperation' and input == '-'): + return 'PossibleArithmeticMinus' + if (state == 'NumberFinalInPossibleOperation'): + return 'InitialState' + else: # esse else ta errado + return 'NumberFinal' + def getNextState(state: str, input: str) -> str: if (not state == 'InitialState'): automata: Automata = findApropriateAutomata(state); @@ -154,7 +186,11 @@ def getNextState(state: str, input: str) -> str: return 'Arithmetic*'; elif (input == '-'): return 'Arithmetic-'; - return '0'; + elif (re.match(r'\d', input)): + return 'Number'; + elif (input == ' ' or input == '\t' or input == '\n'): + return ('InitialState'); + return 'MalformedToken'; def isFinalState(state: str): finalStates = { @@ -166,6 +202,9 
@@ def isFinalState(state: str): 'ArithmeticFinal', 'LogicalOperatorFinal', 'MalformedToken', + 'NumberFinal', + 'ArithmeticOperatorFinal', + 'RelationalOperatorFinal', }; if state in finalStates: return True; @@ -182,6 +221,10 @@ def getTokenType(state: str): 'LogicalOperatorFinal': 'LOG', 'MalformedToken': 'TMF', 'RelationalOperatorFinal': 'REL', + 'ArithmeticOperatorFinal': 'ART', + 'NumberFinal': 'NRO', + 'MalformedNumber': 'NMF', + 'MalformedNumberFinal': 'NMF', } return stateToTokenType.get(state, 'None'); @@ -203,6 +246,8 @@ def findApropriateAutomata(state: str) -> Automata: return RelationalOperatorAutomata; elif ('Arithmetic' in state): return ArithmeticOperatorAutomata; + elif ('Number' in state): + return NumbertAutomata; return ErrorAutomata; def generateToken(state: str, lineNumber: int, lineText: str, tokenStartIndex: int, tokenEndIndex: int): @@ -236,6 +281,13 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s # Proximo estado, dado o caractere lido nextState: str = getNextState(currentState, character); + if (currentState == 'NumberFinalInPossibleOperation'): + token = generateToken(currentState, lineCount, line, tokenStartIndex, currentIndex); + tokensFoundInThisLine.append(token); + + if (currentState == 'GoBack'): + currentIndex = tokenStartIndex; + # Se for um estado final, gere um token if (isFinalState(nextState)): token = generateToken(currentState, lineCount, line, tokenStartIndex, currentIndex); @@ -258,79 +310,6 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentState = nextState # Define o priximo estado currentIndex = currentIndex + 1; - if (currentState == '0'): - if (re.match(r'\d', line[currentIndex])): - tokenStartIndex = currentIndex; - currentIndex = currentIndex + 1; - currentState = '21'; - elif(line[currentIndex] == ' ' or line[currentIndex] == '\t' or line[currentIndex] == '\n'): - currentIndex = currentIndex + 1; - currentState = '0'; - else: - mlkmk = 
line[currentIndex]; - t = Token('TMF', lineCount, currentIndex, currentIndex, mlkmk); - tokensFoundInThisLine.append(t); - currentIndex = currentIndex + 1; - currentState = '0'; - - elif(currentState == '21'): - if(currentIndex + 1 >= lineLength): - l = line[tokenStartIndex:] - if (l[len(l) - 1] == '\n'): - l = l - #l = l[ :- 1] - t = Token('NRO', lineCount, tokenStartIndex, currentIndex, l); - tokensFoundInThisLine.append(t); - currentIndex = currentIndex + 1; - elif (re.match(r'\d', line[currentIndex])): - currentIndex = currentIndex + 1; - elif (line[currentIndex] == '.'): - currentState = '22'; - currentIndex = currentIndex + 1; - else: - t = Token('NRO', lineCount, tokenStartIndex, currentIndex, line[tokenStartIndex: currentIndex]); - tokensFoundInThisLine.append(t); - if (line[currentIndex] == ' ' or line[currentIndex] == '-' or line[currentIndex] == '\t'): - currentState = '24'; - else: - currentState = '0'; - elif(currentState == '22'): - if (re.match(r'\d', line[currentIndex])): - currentIndex = currentIndex + 1; - currentState = '23'; - else: - t = Token('NMF', lineCount, tokenStartIndex, currentIndex, line[tokenStartIndex: currentIndex]); - tokensFoundInThisLine.append(t); - currentState = '0'; - elif(currentState == '23'): - if (re.match(r'\d', line[currentIndex])): - currentIndex = currentIndex + 1; - else: - t = Token('NRO', lineCount, tokenStartIndex, currentIndex, line[tokenStartIndex: currentIndex]); - tokensFoundInThisLine.append(t); - currentState = '0'; - elif(currentState == '24'): - if (line[currentIndex] == ' ' or line[currentIndex] == '\t'): - currentIndex = currentIndex + 1; - elif (line[currentIndex] == '-'): - tokenStartIndex = currentIndex; - currentIndex = currentIndex + 1; - currentState = '25'; - else: - currentState = '0'; - elif(currentState == '25'): - if (line[currentIndex] == ' ' or line[currentIndex] == '\t'): - currentIndex = currentIndex + 1; - elif (re.match(r'\d', line[currentIndex])): - t = Token('ART', lineCount, 
tokenStartIndex, tokenStartIndex + 1, line[tokenStartIndex: tokenStartIndex + 1]); - tokensFoundInThisLine.append(t); - currentState = '0'; - else: - currentIndex = tokenStartIndex; - currentState = '0'; - else: - exitLoop = True; - if (currentState != '8'): currentState = '0'; return ResTokenList(currentState, tokenStartIndex, lineCount, tokenOverflow, tokensFoundInThisLine); From a2a1bedd49ca50f9678f3e105c6aefb89cccb78a Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 22:04:44 -0300 Subject: [PATCH 12/20] =?UTF-8?q?=F0=9F=AA=B2=20Fixes=20in=20token=20gener?= =?UTF-8?q?ation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/filesystem.py | 2 +- src/main.py | 35 +++++++++++++++++++++++------------ 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/filesystem.py b/src/filesystem.py index d9182c1..782154e 100644 --- a/src/filesystem.py +++ b/src/filesystem.py @@ -35,7 +35,7 @@ def readFileLines(opened_file: TextIOWrapper, on_line: Callable[[str, int, str, # Lê cada linha do arquivo e passa para a função de callback on_line response: list[Token] = []; - currentState: ResTokenList = ResTokenList('0', 0, 0, '', []); + currentState: ResTokenList = ResTokenList('InitialState', 0, 0, '', []); # Loop through each line via file handler for count, line in enumerate(opened_file): diff --git a/src/main.py b/src/main.py index d859c9f..cb26f8b 100644 --- a/src/main.py +++ b/src/main.py @@ -38,7 +38,7 @@ def hasNonASCII(s: str): Automata = Callable[[str, str], str]; def IdendifierAutomata(state: str, input: str): - isValid = re.match(r'[a-zA-Z]+', input) or re.match(r'\d', input); + isValid = re.match(r'[a-zA-Z]+', input) or re.match(r'\d', input) or input == '_'; if (state == 'IdentifierFinal'): return 'IdentifierFinal'; if (state == 'Identifier' and isValid): @@ -62,6 +62,8 @@ def StringAutomata(state: str, input: str): return 'StringFinal'; if (state == 'String' and input == '\n'): return 
'MalformedString'; + if (state == 'MalformedString' and input == '\n'): + return 'MalformedStringFinal'; return 'String'; def CommentAutomata(state: str, input: str): @@ -161,6 +163,8 @@ def NumbertAutomata(state: str, input: str): return 'NumberFinal' def getNextState(state: str, input: str) -> str: + if (state == 'MalformedToken'): + return 'MalformedTokenFinal'; if (not state == 'InitialState'): automata: Automata = findApropriateAutomata(state); return automata(state, input); @@ -195,16 +199,18 @@ def getNextState(state: str, input: str) -> str: def isFinalState(state: str): finalStates = { 'DelimiterFinal', - 'MalformedString', + 'MalformedStringFinal', 'StringFinal', 'LineCommentFinal', 'BlockCommentFinal', 'ArithmeticFinal', 'LogicalOperatorFinal', - 'MalformedToken', 'NumberFinal', 'ArithmeticOperatorFinal', 'RelationalOperatorFinal', + 'IdentifierFinal', + 'MalformedNumberFinal', + 'MalformedTokenFinal', }; if state in finalStates: return True; @@ -213,18 +219,19 @@ def isFinalState(state: str): def getTokenType(state: str): stateToTokenType = { 'DelimiterFinal': 'DEL', - 'MalformedString': 'CMF', + 'MalformedStringFinal': 'CMF', 'StringFinal': 'CAC', 'LineCommentFinal': 'COM', 'BlockCommentFinal': 'CMB', 'ArithmeticFinal': 'ART', 'LogicalOperatorFinal': 'LOG', - 'MalformedToken': 'TMF', + 'MalformedTokenFinal': 'TMF', 'RelationalOperatorFinal': 'REL', 'ArithmeticOperatorFinal': 'ART', 'NumberFinal': 'NRO', 'MalformedNumber': 'NMF', 'MalformedNumberFinal': 'NMF', + 'IdentifierFinal': 'IDE', } return stateToTokenType.get(state, 'None'); @@ -290,17 +297,21 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s # Se for um estado final, gere um token if (isFinalState(nextState)): - token = generateToken(currentState, lineCount, line, tokenStartIndex, currentIndex); - tokensFoundInThisLine.append(token); # Apos salvar o token + token = generateToken(nextState, lineCount, line, tokenStartIndex, currentIndex); + if (not token.token 
== 'COM'): + tokensFoundInThisLine.append(token); # Apos salvar o token currentState = 'InitialState'; # Volte para o estado inicial # Do contrario, leia o proximo caractere else: # Se a linha termina e o estado não é final, decrementa o index # para chegar num estado final na proxima iteração - if (currentIndex + 1 >= lineLength): - nextState = toFinalState(nextState); # Define o estado como final - currentIndex = currentIndex - 1; + if (currentIndex + 1 >= lineLength and not nextState == 'InitialState'): + currentIndex = currentIndex + 1; + token = generateToken(nextState, lineCount, line, tokenStartIndex, currentIndex); + if (not token.token == 'COM'): + tokensFoundInThisLine.append(token); # Apos salvar o token + currentState = 'InitialState'; # Se há um comentario de bloco multilinha if (currentState == 'BlockCommentOverflow'): @@ -310,8 +321,8 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentState = nextState # Define o priximo estado currentIndex = currentIndex + 1; - if (currentState != '8'): - currentState = '0'; + if (currentState != 'BlockCommentOverflow'): + currentState = 'InitialState'; return ResTokenList(currentState, tokenStartIndex, lineCount, tokenOverflow, tokensFoundInThisLine); From 2ebf0677375bfc5fd11dd96d5397ee41551bc172 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sat, 8 Oct 2022 22:38:10 -0300 Subject: [PATCH 13/20] =?UTF-8?q?=F0=9F=AA=B2=20Fix=20malformed=20numbers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/main.py b/src/main.py index cb26f8b..39a66bc 100644 --- a/src/main.py +++ b/src/main.py @@ -82,6 +82,8 @@ def CommentAutomata(state: str, input: str): if (state == 'BlockComment' and input == '*'): return 'ClosingBlockComment' if (state == 'ClosingBlockComment' and input == '/'): + return 'BlockCommentComplete' + if 
(state == 'BlockCommentComplete'): return 'BlockCommentFinal' if (state == 'ClosingBlockComment' and not input == '/'): return 'BlockComment' @@ -148,7 +150,7 @@ def NumbertAutomata(state: str, input: str): if (state == 'FPNumberComplete' and not re.match(r'\d', input)): return 'NumberFinal' if (state == 'FPNumber' and not re.match(r'\d', input)): - return 'MalformedNumber' + return 'MalformedNumberFinal' if (state == 'Number' and not (input == ' ' or input == '-' or input == '\t')): return 'NumberFinal' if (state == 'Number' and input == ' ' or input == '-' or input == '\t'): @@ -273,7 +275,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s currentIndex: int = 0; currentState: str = initialState; tokensFoundInThisLine: list[Token] = []; - tokenOverflow: str = overflow if initialState == '8' else ''; + tokenOverflow: str = overflow if initialState == 'BlockComment' else ''; exitLoop = False; @@ -298,7 +300,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s # Se for um estado final, gere um token if (isFinalState(nextState)): token = generateToken(nextState, lineCount, line, tokenStartIndex, currentIndex); - if (not token.token == 'COM'): + if (not (token.token == 'COM' or token.token == 'CMB')): tokensFoundInThisLine.append(token); # Apos salvar o token currentState = 'InitialState'; # Volte para o estado inicial @@ -306,23 +308,21 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s else: # Se a linha termina e o estado não é final, decrementa o index # para chegar num estado final na proxima iteração - if (currentIndex + 1 >= lineLength and not nextState == 'InitialState'): - currentIndex = currentIndex + 1; - token = generateToken(nextState, lineCount, line, tokenStartIndex, currentIndex); - if (not token.token == 'COM'): - tokensFoundInThisLine.append(token); # Apos salvar o token - currentState = 'InitialState'; - - # Se há um comentario de bloco multilinha - 
if (currentState == 'BlockCommentOverflow'): - tokenOverflow = line[tokenStartIndex:].replace('\n', ''); - exitLoop = True; + #if (currentIndex + 1 >= lineLength and not nextState == 'InitialState'): + # currentIndex = currentIndex + 1; + # token = generateToken(nextState, lineCount, line, tokenStartIndex, currentIndex); + # if (not (token.token == 'COM' or token.token == 'CMB')): + # tokensFoundInThisLine.append(token); # Apos salvar o token + # currentState = 'InitialState'; currentState = nextState # Define o priximo estado currentIndex = currentIndex + 1; if (currentState != 'BlockCommentOverflow'): currentState = 'InitialState'; + else: + tokenOverflow = line[tokenStartIndex:].replace('\n', ''); + currentState = 'BlockComment'; return ResTokenList(currentState, tokenStartIndex, lineCount, tokenOverflow, tokensFoundInThisLine); From 338d716630adf01bd52063f869630e071293b15c Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sun, 9 Oct 2022 10:53:05 -0300 Subject: [PATCH 14/20] =?UTF-8?q?=F0=9F=AA=B2=20Fix=20numbers=20automata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/src/main.py b/src/main.py index 39a66bc..f1394cf 100644 --- a/src/main.py +++ b/src/main.py @@ -153,14 +153,20 @@ def NumbertAutomata(state: str, input: str): return 'MalformedNumberFinal' if (state == 'Number' and not (input == ' ' or input == '-' or input == '\t')): return 'NumberFinal' - if (state == 'Number' and input == ' ' or input == '-' or input == '\t'): + if (state == 'Number' and (input == ' ' or input == '-' or input == '\t')): return 'NumberFinalInPossibleOperation' if (state == 'NumberFinalInPossibleOperation' and (input == ' ' or input == '\t')): - return 'NumberFinalInPossibleOperation' + return 'NumberFinalInPossibleOperation_' + if (state == 'NumberFinalInPossibleOperation_' and (input == ' ' or input == '\t')): 
+ return 'NumberFinalInPossibleOperation_' if (state == 'NumberFinalInPossibleOperation' and input == '-'): return 'PossibleArithmeticMinus' + if (state == 'NumberFinalInPossibleOperation_' and input == '-'): + return 'PossibleArithmeticMinus' if (state == 'NumberFinalInPossibleOperation'): return 'InitialState' + if (state == 'NumberFinalInPossibleOperation_'): + return 'InitialState' else: # esse else ta errado return 'NumberFinal' @@ -290,12 +296,15 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s # Proximo estado, dado o caractere lido nextState: str = getNextState(currentState, character); - if (currentState == 'NumberFinalInPossibleOperation'): - token = generateToken(currentState, lineCount, line, tokenStartIndex, currentIndex); + if (nextState == 'NumberFinalInPossibleOperation'): + token = generateToken('NumberFinal', lineCount, line, tokenStartIndex, currentIndex); tokensFoundInThisLine.append(token); + tokenStartIndex = currentIndex; + currentIndex = currentIndex - 1; - if (currentState == 'GoBack'): - currentIndex = tokenStartIndex; + if (nextState == 'GoBack'): + nextState = 'InitialState'; + currentIndex = tokenStartIndex - 1; # Se for um estado final, gere um token if (isFinalState(nextState)): @@ -308,12 +317,13 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s else: # Se a linha termina e o estado não é final, decrementa o index # para chegar num estado final na proxima iteração - #if (currentIndex + 1 >= lineLength and not nextState == 'InitialState'): - # currentIndex = currentIndex + 1; - # token = generateToken(nextState, lineCount, line, tokenStartIndex, currentIndex); - # if (not (token.token == 'COM' or token.token == 'CMB')): - # tokensFoundInThisLine.append(token); # Apos salvar o token - # currentState = 'InitialState'; + if (currentIndex + 1 >= lineLength and not (nextState == 'InitialState' or nextState == 'BlockCommentOverflow')): + nextState = getNextState(nextState, 
'\n'); + currentIndex = currentIndex + 2; + token = generateToken(nextState, lineCount, line, tokenStartIndex, currentIndex); + if (not (token.token == 'COM' or token.token == 'CMB')): + tokensFoundInThisLine.append(token); # Apos salvar o token + currentState = 'InitialState'; currentState = nextState # Define o priximo estado currentIndex = currentIndex + 1; From 999af49b101547dc3e63ddc421f79721f058eb10 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sun, 9 Oct 2022 11:48:16 -0300 Subject: [PATCH 15/20] =?UTF-8?q?=F0=9F=AA=B2=20Fix=20block=20comments=20a?= =?UTF-8?q?nd=20double=20logical=20operator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/filesystem.py | 2 +- src/main.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/filesystem.py b/src/filesystem.py index 782154e..d467c72 100644 --- a/src/filesystem.py +++ b/src/filesystem.py @@ -45,7 +45,7 @@ def readFileLines(opened_file: TextIOWrapper, on_line: Callable[[str, int, str, response = response + res.tokenList; currentState = res; - if (currentState.lastState == '8'): + if (currentState.lastState == 'BlockComment'): t = Token('CoMF', currentState.tokenStartLine, currentState.lastStartTokenIndex, 0, currentState.tokenOverflow); response.append(t); diff --git a/src/main.py b/src/main.py index f1394cf..ea757d4 100644 --- a/src/main.py +++ b/src/main.py @@ -93,13 +93,15 @@ def CommentAutomata(state: str, input: str): def LogicalOperatorAutomata(state: str, input: str): if (state == 'PossibleLogical&&' and input == '&'): + return 'DoubleLogicalOperator' + if (state == 'DoubleLogicalOperator'): return 'LogicalOperatorFinal' if (state == 'PossibleLogical||' and input == '|'): - return 'LogicalOperatorFinal' + return 'DoubleLogicalOperator' if (state == 'PossibleLogical!' and not input == '='): return 'LogicalOperatorFinal' if (state == 'PossibleLogical!' 
and input == '='): - return 'RelationalOperatorFinal' + return 'DoubleRelationalOperator' return 'MalformedToken'; def RelationalOperatorAutomata(state: str, input: str): @@ -321,7 +323,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s nextState = getNextState(nextState, '\n'); currentIndex = currentIndex + 2; token = generateToken(nextState, lineCount, line, tokenStartIndex, currentIndex); - if (not (token.token == 'COM' or token.token == 'CMB')): + if (not (token.token == 'COM' or token.token == 'CMB' or token.token == 'None')): tokensFoundInThisLine.append(token); # Apos salvar o token currentState = 'InitialState'; @@ -331,7 +333,7 @@ def findTokensInString(line: str, lineCount: int, initialState: str, overflow: s if (currentState != 'BlockCommentOverflow'): currentState = 'InitialState'; else: - tokenOverflow = line[tokenStartIndex:].replace('\n', ''); + tokenOverflow = tokenOverflow + line[tokenStartIndex:].replace('\n', ''); currentState = 'BlockComment'; return ResTokenList(currentState, tokenStartIndex, lineCount, tokenOverflow, tokensFoundInThisLine); From 6fd7e9e9d3ef8f7dc0b4dd5ba20e6ac1012a4e4c Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sun, 9 Oct 2022 11:53:21 -0300 Subject: [PATCH 16/20] =?UTF-8?q?=F0=9F=AA=B2=20Fix=20string=20automata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main.py b/src/main.py index ea757d4..5a2e267 100644 --- a/src/main.py +++ b/src/main.py @@ -59,6 +59,8 @@ def ErrorAutomata(state: str, input: str): def StringAutomata(state: str, input: str): if (state == 'String' and input == '"'): + return 'StringComplete'; + if (state == 'StringComplete'): return 'StringFinal'; if (state == 'String' and input == '\n'): return 'MalformedString'; From 4e7dbbd438cccf66b2e084c1ee08dd1ab9a13cf6 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sun, 9 Oct 2022 12:49:43 -0300 
Subject: [PATCH 17/20] =?UTF-8?q?=F0=9F=9A=9A=20Move=20all=20automata=20to?= =?UTF-8?q?=20automata=20package?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/automata/ArithmeticOperatorAutomata.py | 30 +++++ src/automata/CommentAutomata.py | 24 ++++ src/automata/DelimiterAutomata.py | 6 + src/automata/ErrorAutomata.py | 2 + src/automata/IdentifierAutomata.py | 12 ++ src/automata/LogicalOperatorAutomata.py | 12 ++ src/automata/NumbertAutomata.py | 33 ++++++ src/automata/RelationalOperatorAutomata.py | 6 + src/automata/StringAutomata.py | 10 ++ src/automata/findApropriateAutomata.py | 32 ++++++ src/automata/findTokensInStringAutomata.py | 127 +++++++++++++++++++++ 11 files changed, 294 insertions(+) create mode 100644 src/automata/ArithmeticOperatorAutomata.py create mode 100644 src/automata/CommentAutomata.py create mode 100644 src/automata/DelimiterAutomata.py create mode 100644 src/automata/ErrorAutomata.py create mode 100644 src/automata/IdentifierAutomata.py create mode 100644 src/automata/LogicalOperatorAutomata.py create mode 100644 src/automata/NumbertAutomata.py create mode 100644 src/automata/RelationalOperatorAutomata.py create mode 100644 src/automata/StringAutomata.py create mode 100644 src/automata/findApropriateAutomata.py create mode 100644 src/automata/findTokensInStringAutomata.py diff --git a/src/automata/ArithmeticOperatorAutomata.py b/src/automata/ArithmeticOperatorAutomata.py new file mode 100644 index 0000000..a0f6807 --- /dev/null +++ b/src/automata/ArithmeticOperatorAutomata.py @@ -0,0 +1,30 @@ +import re + +def ArithmeticOperatorAutomata(state: str, input: str): + if (state == 'Arithmetic' and input == '+'): + return 'DoubleArithmeticOperator' + if (state == 'DoubleArithmeticOperator'): + return 'ArithmeticOperatorFinal' + if (state == 'Arithmetic*'): + return 'ArithmeticOperatorFinal' + if (state == 'Arithmetic-' and input == '-'): + return 'DoubleArithmeticOperator' + if (state == 
'DoubleArithmeticOperator'): + return 'ArithmeticOperatorFinal' + if (state == 'Arithmetic-' and input == ' '): + return 'ArithmeticPossibleNROorART' + if (state == 'Arithmetic-' and re.match(r'\d', input)): + return 'Number' + if (state == 'ArithmeticPossibleNROorART' and re.match(r'\d', input)): + return 'Number' + if (state == 'ArithmeticPossibleNROorART' and input == ' '): + return 'ArithmeticPossibleNROorART' + if (state == 'ArithmeticPossibleNROorART'): + return 'ArithmeticOperatorFinal' + if (state == 'PossibleArithmeticMinus' and input == ' ' or input == '\t'): + return 'PossibleArithmeticMinus' + if (state == 'PossibleArithmeticMinus' and re.match(r'\d', input)): + return 'ArithmeticOperatorFinal' + if (state == 'PossibleArithmeticMinus'): + return 'GoBack' + return 'ArithmeticOperatorFinal'; diff --git a/src/automata/CommentAutomata.py b/src/automata/CommentAutomata.py new file mode 100644 index 0000000..0b96834 --- /dev/null +++ b/src/automata/CommentAutomata.py @@ -0,0 +1,24 @@ +def CommentAutomata(state: str, input: str): + if (state == 'PossibleComment' and not (input == '*' or input == '/')): + return 'ArithmeticFinal' + if (state == 'PossibleComment' and input == '*'): + return 'BlockComment' + if (state == 'PossibleComment' and input == '/'): + return 'LineComment' + if (state == 'LineComment' and input == '\n'): + return 'LineCommentFinal' + if (state == 'BlockComment' and input == '\n'): + return 'BlockCommentOverflow' + if (state == 'LineComment'): + return 'LineComment' + if (state == 'BlockComment' and input == '*'): + return 'ClosingBlockComment' + if (state == 'ClosingBlockComment' and input == '/'): + return 'BlockCommentComplete' + if (state == 'BlockCommentComplete'): + return 'BlockCommentFinal' + if (state == 'ClosingBlockComment' and not input == '/'): + return 'BlockComment' + if (state == 'BlockComment'): + return 'BlockComment' + return state + 'Error:_' + input; diff --git a/src/automata/DelimiterAutomata.py 
b/src/automata/DelimiterAutomata.py new file mode 100644 index 0000000..86be0d5 --- /dev/null +++ b/src/automata/DelimiterAutomata.py @@ -0,0 +1,6 @@ +def DelimiterAutomata(state: str, input: str): + if (state == 'DelimiterFinal'): + return 'DelimiterFinal'; + if (state == 'Delimiter'): + return 'DelimiterFinal'; + return state + 'Error:_' + input; diff --git a/src/automata/ErrorAutomata.py b/src/automata/ErrorAutomata.py new file mode 100644 index 0000000..6be1f77 --- /dev/null +++ b/src/automata/ErrorAutomata.py @@ -0,0 +1,2 @@ +def ErrorAutomata(state: str, input: str): + return state + 'Error:_' + input; diff --git a/src/automata/IdentifierAutomata.py b/src/automata/IdentifierAutomata.py new file mode 100644 index 0000000..ecae90a --- /dev/null +++ b/src/automata/IdentifierAutomata.py @@ -0,0 +1,12 @@ +import re + + +def IdendifierAutomata(state: str, input: str): + isValid = re.match(r'[a-zA-Z]+', input) or re.match(r'\d', input) or input == '_'; + if (state == 'IdentifierFinal'): + return 'IdentifierFinal'; + if (state == 'Identifier' and isValid): + return 'Identifier' + elif (state == 'Identifier' and not isValid): + return 'IdentifierFinal' + return state + 'Error:_' + input; diff --git a/src/automata/LogicalOperatorAutomata.py b/src/automata/LogicalOperatorAutomata.py new file mode 100644 index 0000000..1f49fa6 --- /dev/null +++ b/src/automata/LogicalOperatorAutomata.py @@ -0,0 +1,12 @@ +def LogicalOperatorAutomata(state: str, input: str): + if (state == 'PossibleLogical&&' and input == '&'): + return 'DoubleLogicalOperator' + if (state == 'DoubleLogicalOperator'): + return 'LogicalOperatorFinal' + if (state == 'PossibleLogical||' and input == '|'): + return 'DoubleLogicalOperator' + if (state == 'PossibleLogical!' and not input == '='): + return 'LogicalOperatorFinal' + if (state == 'PossibleLogical!' 
and input == '='): + return 'DoubleRelationalOperator' + return 'MalformedToken'; diff --git a/src/automata/NumbertAutomata.py b/src/automata/NumbertAutomata.py new file mode 100644 index 0000000..45d8afa --- /dev/null +++ b/src/automata/NumbertAutomata.py @@ -0,0 +1,33 @@ +import re + +def NumbertAutomata(state: str, input: str): + if (state == 'Number' and re.match(r'\d', input)): + return 'Number' + if (state == 'Number' and input == '.'): + return 'FPNumber' + if (state == 'FPNumber' and re.match(r'\d', input)): + return 'FPNumberComplete' + if (state == 'FPNumberComplete' and re.match(r'\d', input)): + return 'FPNumberComplete' + if (state == 'FPNumberComplete' and not re.match(r'\d', input)): + return 'NumberFinal' + if (state == 'FPNumber' and not re.match(r'\d', input)): + return 'MalformedNumberFinal' + if (state == 'Number' and not (input == ' ' or input == '-' or input == '\t')): + return 'NumberFinal' + if (state == 'Number' and (input == ' ' or input == '-' or input == '\t')): + return 'NumberFinalInPossibleOperation' + if (state == 'NumberFinalInPossibleOperation' and (input == ' ' or input == '\t')): + return 'NumberFinalInPossibleOperation_' + if (state == 'NumberFinalInPossibleOperation_' and (input == ' ' or input == '\t')): + return 'NumberFinalInPossibleOperation_' + if (state == 'NumberFinalInPossibleOperation' and input == '-'): + return 'PossibleArithmeticMinus' + if (state == 'NumberFinalInPossibleOperation_' and input == '-'): + return 'PossibleArithmeticMinus' + if (state == 'NumberFinalInPossibleOperation'): + return 'InitialState' + if (state == 'NumberFinalInPossibleOperation_'): + return 'InitialState' + else: + return 'NumberFinal' diff --git a/src/automata/RelationalOperatorAutomata.py b/src/automata/RelationalOperatorAutomata.py new file mode 100644 index 0000000..c61f8d5 --- /dev/null +++ b/src/automata/RelationalOperatorAutomata.py @@ -0,0 +1,6 @@ +def RelationalOperatorAutomata(state: str, input: str): + if (state == 'Relational' 
and input == '='): + return 'DoubleRelationalOperator' + if (state == 'DoubleRelationalOperator'): + return 'RelationalOperatorFinal' + return 'RelationalOperatorFinal'; diff --git a/src/automata/StringAutomata.py b/src/automata/StringAutomata.py new file mode 100644 index 0000000..5742d86 --- /dev/null +++ b/src/automata/StringAutomata.py @@ -0,0 +1,10 @@ +def StringAutomata(state: str, input: str): + if (state == 'String' and input == '"'): + return 'StringComplete'; + if (state == 'StringComplete'): + return 'StringFinal'; + if (state == 'String' and input == '\n'): + return 'MalformedString'; + if (state == 'MalformedString' and input == '\n'): + return 'MalformedStringFinal'; + return 'String'; diff --git a/src/automata/findApropriateAutomata.py b/src/automata/findApropriateAutomata.py new file mode 100644 index 0000000..e4d0f13 --- /dev/null +++ b/src/automata/findApropriateAutomata.py @@ -0,0 +1,32 @@ +from typing import Callable +from automata.ArithmeticOperatorAutomata import ArithmeticOperatorAutomata +from automata.CommentAutomata import CommentAutomata +from automata.DelimiterAutomata import DelimiterAutomata +from automata.ErrorAutomata import ErrorAutomata +from automata.IdentifierAutomata import IdendifierAutomata +from automata.LogicalOperatorAutomata import LogicalOperatorAutomata +from automata.NumbertAutomata import NumbertAutomata +from automata.RelationalOperatorAutomata import RelationalOperatorAutomata +from automata.StringAutomata import StringAutomata + + +Automata = Callable[[str, str], str]; + +def findApropriateAutomata(state: str) -> Automata: + if ('Identifier' in state): + return IdendifierAutomata; + elif ('Delimiter' in state): + return DelimiterAutomata; + elif ('String' in state): + return StringAutomata; + elif ('Comment' in state): + return CommentAutomata; + elif ('Logical' in state): + return LogicalOperatorAutomata; + elif ('Relational' in state): + return RelationalOperatorAutomata; + elif ('Arithmetic' in state): + return 
ArithmeticOperatorAutomata; + elif ('Number' in state): + return NumbertAutomata; + return ErrorAutomata; diff --git a/src/automata/findTokensInStringAutomata.py b/src/automata/findTokensInStringAutomata.py new file mode 100644 index 0000000..0203e13 --- /dev/null +++ b/src/automata/findTokensInStringAutomata.py @@ -0,0 +1,127 @@ +import re +from TokenUtils.Token import Token +from TokenUtils.generateTokenFromState import generateTokenFromState +from automata.findApropriateAutomata import Automata, findApropriateAutomata +from filesystem import ResTokenList + +def findTokensInStringAutomata(line: str, lineCount: int, initialState: str, overflow: str) -> ResTokenList: + lineLength: int = len(line); + tokenStartIndex: int = 0; + currentIndex: int = 0; + currentState: str = initialState; + tokensFoundInThisLine: list[Token] = []; + tokenOverflow: str = overflow if initialState == 'BlockComment' else ''; + + exitLoop = False; + + while (not exitLoop and currentIndex < lineLength): + # Se ainda no estado inicial, considere o caractere atual como inicio do token + if (currentState == 'InitialState'): + tokenStartIndex = currentIndex; + + # Caractere atual + character: str = line[currentIndex]; + + # Proximo estado, dado o caractere lido + nextState: str = getNextState(currentState, character); + + if (nextState == 'NumberFinalInPossibleOperation'): + token = generateTokenFromState('NumberFinal', lineCount, line, tokenStartIndex, currentIndex); + tokensFoundInThisLine.append(token); + tokenStartIndex = currentIndex; + currentIndex = currentIndex - 1; + + if (nextState == 'GoBack'): + nextState = 'InitialState'; + currentIndex = tokenStartIndex - 1; + + # Se for um estado final, gere um token + if (isFinalState(nextState)): + token = generateTokenFromState(nextState, lineCount, line, tokenStartIndex, currentIndex); + if (not (token.token == 'COM' or token.token == 'CMB')): + tokensFoundInThisLine.append(token); # Apos salvar o token + currentState = 'InitialState'; # Volte 
para o estado inicial + + # Do contrario, leia o proximo caractere + else: + # Se a linha termina e o estado não é final, decrementa o index + # para chegar num estado final na proxima iteração + if (currentIndex + 1 >= lineLength and not (nextState == 'InitialState' or nextState == 'BlockCommentOverflow')): + nextState = getNextState(nextState, '\n'); + currentIndex = currentIndex + 2; + token = generateTokenFromState(nextState, lineCount, line, tokenStartIndex, currentIndex); + if (not (token.token == 'COM' or token.token == 'CMB' or token.token == 'None')): + tokensFoundInThisLine.append(token); # Apos salvar o token + currentState = 'InitialState'; + + currentState = nextState # Define o priximo estado + currentIndex = currentIndex + 1; + + if (currentState != 'BlockCommentOverflow'): + currentState = 'InitialState'; + else: + tokenOverflow = tokenOverflow + line[tokenStartIndex:].replace('\n', ''); + currentState = 'BlockComment'; + return ResTokenList(currentState, tokenStartIndex, lineCount, tokenOverflow, tokensFoundInThisLine); + +def getNextState(state: str, input: str) -> str: + if (state == 'MalformedToken'): + return 'MalformedTokenFinal'; + if (not state == 'InitialState'): + automata: Automata = findApropriateAutomata(state); + return automata(state, input); + if (input == '/'): + return 'PossibleComment'; + elif (isDelimiter(input)): + return 'Delimiter' + elif (re.match( r'[a-zA-Z]+', input)): # Se for uma letra + return 'Identifier' + elif (input == '"'): + return 'String'; + elif (input == '&'): + return 'PossibleLogical&&'; + elif (input == '|'): + return 'PossibleLogical||'; + elif (input == '!'): + return 'PossibleLogical!'; + elif (input == '=' or input == '<' or input == '>'): + return 'Relational'; + elif (input == '+'): + return 'Arithmetic'; + elif (input == '*'): + return 'Arithmetic*'; + elif (input == '-'): + return 'Arithmetic-'; + elif (re.match(r'\d', input)): + return 'Number'; + elif (input == ' ' or input == '\t' or input == 
'\n'): + return ('InitialState'); + return 'MalformedToken'; + +# verifica se um caractere é um delimitador +def isDelimiter(charactere: str) -> bool: + delimiters = {'.', ';', ',', '(', ')', '[', ']', '{', '}'} + if charactere in delimiters: + return True + return False + +def isFinalState(state: str): + finalStates = { + 'DelimiterFinal', + 'MalformedStringFinal', + 'StringFinal', + 'LineCommentFinal', + 'BlockCommentFinal', + 'ArithmeticFinal', + 'LogicalOperatorFinal', + 'NumberFinal', + 'ArithmeticOperatorFinal', + 'RelationalOperatorFinal', + 'IdentifierFinal', + 'MalformedNumberFinal', + 'MalformedTokenFinal', + }; + if state in finalStates: + return True; + return False; + From 99367f8f64b7cb46e943f78c40775e76661df703 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sun, 9 Oct 2022 12:50:18 -0300 Subject: [PATCH 18/20] =?UTF-8?q?=F0=9F=9A=9A=20Move=20all=20token=20relat?= =?UTF-8?q?ed=20to=20TokenUtils=20package?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/TokenUtils/Token.py | 8 +++++++ src/TokenUtils/generateTokenFromState.py | 28 ++++++++++++++++++++++++ src/TokenUtils/getTokenTypeFromState.py | 18 +++++++++++++++ src/TokenUtils/orderTokens.py | 14 ++++++++++++ 4 files changed, 68 insertions(+) create mode 100644 src/TokenUtils/Token.py create mode 100644 src/TokenUtils/generateTokenFromState.py create mode 100644 src/TokenUtils/getTokenTypeFromState.py create mode 100644 src/TokenUtils/orderTokens.py diff --git a/src/TokenUtils/Token.py b/src/TokenUtils/Token.py new file mode 100644 index 0000000..d4b9a57 --- /dev/null +++ b/src/TokenUtils/Token.py @@ -0,0 +1,8 @@ +from typing import NamedTuple + +class Token(NamedTuple): + token: str; + line: int; + tokenStartIndex: int; + tokenEndIndex: int; + value: str; diff --git a/src/TokenUtils/generateTokenFromState.py b/src/TokenUtils/generateTokenFromState.py new file mode 100644 index 0000000..b74b2b3 --- /dev/null +++ 
b/src/TokenUtils/generateTokenFromState.py @@ -0,0 +1,28 @@ +from TokenUtils.Token import Token +from TokenUtils.getTokenTypeFromState import getTokenTypeFromState + +# verifica se um caractere é uma palavra reservada +def isReserved(identifier: str) -> bool: + reserved = {'var', 'const', 'struct', 'extends', 'procedure', 'function', 'start', 'return', 'if', + 'else', 'then', 'while', 'read', 'print', 'int', 'real', 'boolean', 'string', 'true', 'false'} + if identifier in reserved: + return True + return False + +def hasNonASCII(s: str): + count = 0; + for char in s: + if (ord(char) < 32 or ord(char) > 126): + count = count + 1; + if (count > 0): + return True; + return False + +def generateTokenFromState(state: str, lineNumber: int, lineText: str, tokenStartIndex: int, tokenEndIndex: int) -> Token: + tokenType = getTokenTypeFromState(state); + tokenText = lineText[tokenStartIndex:tokenEndIndex]; + if (tokenType == 'IDE'): + tokenType = 'PRE' if isReserved(tokenText) else 'IDE'; + if (tokenType == 'CAC'): + tokenType = 'CMF' if hasNonASCII(tokenText) else 'CAC'; + return Token(tokenType, lineNumber, tokenStartIndex, tokenEndIndex, tokenText); diff --git a/src/TokenUtils/getTokenTypeFromState.py b/src/TokenUtils/getTokenTypeFromState.py new file mode 100644 index 0000000..c958e2f --- /dev/null +++ b/src/TokenUtils/getTokenTypeFromState.py @@ -0,0 +1,18 @@ +def getTokenTypeFromState(state: str): + stateToTokenType = { + 'DelimiterFinal': 'DEL', + 'MalformedStringFinal': 'CMF', + 'StringFinal': 'CAC', + 'LineCommentFinal': 'COM', + 'BlockCommentFinal': 'CMB', + 'ArithmeticFinal': 'ART', + 'LogicalOperatorFinal': 'LOG', + 'MalformedTokenFinal': 'TMF', + 'RelationalOperatorFinal': 'REL', + 'ArithmeticOperatorFinal': 'ART', + 'NumberFinal': 'NRO', + 'MalformedNumber': 'NMF', + 'MalformedNumberFinal': 'NMF', + 'IdentifierFinal': 'IDE', + } + return stateToTokenType.get(state, 'None'); diff --git a/src/TokenUtils/orderTokens.py b/src/TokenUtils/orderTokens.py new file mode 
100644 index 0000000..c23bf18 --- /dev/null +++ b/src/TokenUtils/orderTokens.py @@ -0,0 +1,14 @@ +from TokenUtils.Token import Token + + +def isError(err: str): + errors = {'CMF', 'CoMF', 'NMF', 'IMF', 'TMF'} + if err in errors: + return True; + return False + +def orderTokens(tk: Token): + if (isError(tk.token)): + return 1; + return 0; + \ No newline at end of file From 597f9e91e1063c64fd2a303fd3eb71f2ebffded5 Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sun, 9 Oct 2022 12:51:25 -0300 Subject: [PATCH 19/20] =?UTF-8?q?=F0=9F=94=A5=E2=9C=A8=20Update=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 +- src/filesystem.py | 8 +- src/main.py | 364 ++-------------------------------------------- 3 files changed, 12 insertions(+), 362 deletions(-) diff --git a/.gitignore b/.gitignore index af55aa1..a342025 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,4 @@ saida saidas_dos_testes # arquivos de cache do python -src/__pycache__ \ No newline at end of file +*.pyc diff --git a/src/filesystem.py b/src/filesystem.py index d467c72..f0e4caa 100644 --- a/src/filesystem.py +++ b/src/filesystem.py @@ -2,18 +2,12 @@ from os import listdir from os.path import isfile, join from typing import Callable, NamedTuple, TypeVar, Dict +from TokenUtils.Token import Token T = TypeVar('T'); TokenListPerFile = Dict[str, 'list[str]'] -class Token(NamedTuple): - token: str; - line: int; - tokenStartIndex: int; - tokenEndIndex: int; - value: str; - class ResTokenList(NamedTuple): lastState: str; lastStartTokenIndex: int; diff --git a/src/main.py b/src/main.py index 5a2e267..4044f0b 100644 --- a/src/main.py +++ b/src/main.py @@ -1,356 +1,14 @@ -import re -from typing import Callable -from filesystem import ResTokenList, Token, TokenListPerFile, listDirFiles, readFileLines +from TokenUtils.Token import Token +from TokenUtils.orderTokens import isError, orderTokens +from automata.findTokensInStringAutomata import 
findTokensInStringAutomata +from filesystem import ResTokenList, TokenListPerFile, listDirFiles, readFileLines -LETRA = re.compile(r'/[a-zA-Z]+/g'); -DIGITO = re.compile(r'/\d/g'); - - -def onReadLine(line: str, lineNumber: int, initialState: str, overflow: str) -> ResTokenList: +def findTokensInString(line: str, lineNumber: int, initialState: str, overflow: str) -> ResTokenList: # Se a linha estiver vaxia não fax nada - tokens = findTokensInString(line, lineNumber, initialState, overflow); + tokens = findTokensInStringAutomata(line, lineNumber, initialState, overflow); return tokens; -# verifica se um caractere é um delimitador -def isDelimiter(charactere: str) -> bool: - delimiters = {'.', ';', ',', '(', ')', '[', ']', '{', '}'} - if charactere in delimiters: - return True - return False - -# verifica se um caractere é uma palavra reservada -def isReserved(identifier: str) -> bool: - reserved = {'var', 'const', 'struct', 'extends', 'procedure', 'function', 'start', 'return', 'if', - 'else', 'then', 'while', 'read', 'print', 'int', 'real', 'boolean', 'string', 'true', 'false'} - if identifier in reserved: - return True - return False - -def hasNonASCII(s: str): - count = 0; - for char in s: - if (ord(char) < 32 or ord(char) > 126): - count = count + 1; - if (count > 0): - return True; - return False - -Automata = Callable[[str, str], str]; - -def IdendifierAutomata(state: str, input: str): - isValid = re.match(r'[a-zA-Z]+', input) or re.match(r'\d', input) or input == '_'; - if (state == 'IdentifierFinal'): - return 'IdentifierFinal'; - if (state == 'Identifier' and isValid): - return 'Identifier' - elif (state == 'Identifier' and not isValid): - return 'IdentifierFinal' - return state + 'Error:_' + input; - -def DelimiterAutomata(state: str, input: str): - if (state == 'DelimiterFinal'): - return 'DelimiterFinal'; - if (state == 'Delimiter'): - return 'DelimiterFinal'; - return state + 'Error:_' + input; - -def ErrorAutomata(state: str, input: str): - return 
state + 'Error:_' + input; - -def StringAutomata(state: str, input: str): - if (state == 'String' and input == '"'): - return 'StringComplete'; - if (state == 'StringComplete'): - return 'StringFinal'; - if (state == 'String' and input == '\n'): - return 'MalformedString'; - if (state == 'MalformedString' and input == '\n'): - return 'MalformedStringFinal'; - return 'String'; - -def CommentAutomata(state: str, input: str): - if (state == 'PossibleComment' and not (input == '*' or input == '/')): - return 'ArithmeticFinal' - if (state == 'PossibleComment' and input == '*'): - return 'BlockComment' - if (state == 'PossibleComment' and input == '/'): - return 'LineComment' - if (state == 'LineComment' and input == '\n'): - return 'LineCommentFinal' - if (state == 'BlockComment' and input == '\n'): - return 'BlockCommentOverflow' - if (state == 'LineComment'): - return 'LineComment' - if (state == 'BlockComment' and input == '*'): - return 'ClosingBlockComment' - if (state == 'ClosingBlockComment' and input == '/'): - return 'BlockCommentComplete' - if (state == 'BlockCommentComplete'): - return 'BlockCommentFinal' - if (state == 'ClosingBlockComment' and not input == '/'): - return 'BlockComment' - if (state == 'BlockComment'): - return 'BlockComment' - return state + 'Error:_' + input; - -def LogicalOperatorAutomata(state: str, input: str): - if (state == 'PossibleLogical&&' and input == '&'): - return 'DoubleLogicalOperator' - if (state == 'DoubleLogicalOperator'): - return 'LogicalOperatorFinal' - if (state == 'PossibleLogical||' and input == '|'): - return 'DoubleLogicalOperator' - if (state == 'PossibleLogical!' and not input == '='): - return 'LogicalOperatorFinal' - if (state == 'PossibleLogical!' 
and input == '='): - return 'DoubleRelationalOperator' - return 'MalformedToken'; - -def RelationalOperatorAutomata(state: str, input: str): - if (state == 'Relational' and input == '='): - return 'DoubleRelationalOperator' - if (state == 'DoubleRelationalOperator'): - return 'RelationalOperatorFinal' - return 'RelationalOperatorFinal'; - -def ArithmeticOperatorAutomata(state: str, input: str): - if (state == 'Arithmetic' and input == '+'): - return 'DoubleArithmeticOperator' - if (state == 'DoubleArithmeticOperator'): - return 'ArithmeticOperatorFinal' - if (state == 'Arithmetic*'): - return 'ArithmeticOperatorFinal' - if (state == 'Arithmetic-' and input == '-'): - return 'DoubleArithmeticOperator' - if (state == 'DoubleArithmeticOperator'): - return 'ArithmeticOperatorFinal' - if (state == 'Arithmetic-' and input == ' '): - return 'ArithmeticPossibleNROorART' - if (state == 'Arithmetic-' and re.match(r'\d', input)): - return 'Number' - if (state == 'ArithmeticPossibleNROorART' and re.match(r'\d', input)): - return 'Number' - if (state == 'ArithmeticPossibleNROorART' and input == ' '): - return 'ArithmeticPossibleNROorART' - if (state == 'ArithmeticPossibleNROorART'): - return 'ArithmeticOperatorFinal' - if (state == 'PossibleArithmeticMinus' and input == ' ' or input == '\t'): - return 'PossibleArithmeticMinus' - if (state == 'PossibleArithmeticMinus' and re.match(r'\d', input)): - return 'ArithmeticOperatorFinal' - if (state == 'PossibleArithmeticMinus'): - return 'GoBack' - return 'ArithmeticOperatorFinal'; - -def NumbertAutomata(state: str, input: str): - if (state == 'Number' and re.match(r'\d', input)): - return 'Number' - if (state == 'Number' and input == '.'): - return 'FPNumber' - if (state == 'FPNumber' and re.match(r'\d', input)): - return 'FPNumberComplete' - if (state == 'FPNumberComplete' and re.match(r'\d', input)): - return 'FPNumberComplete' - if (state == 'FPNumberComplete' and not re.match(r'\d', input)): - return 'NumberFinal' - if (state == 
'FPNumber' and not re.match(r'\d', input)): - return 'MalformedNumberFinal' - if (state == 'Number' and not (input == ' ' or input == '-' or input == '\t')): - return 'NumberFinal' - if (state == 'Number' and (input == ' ' or input == '-' or input == '\t')): - return 'NumberFinalInPossibleOperation' - if (state == 'NumberFinalInPossibleOperation' and (input == ' ' or input == '\t')): - return 'NumberFinalInPossibleOperation_' - if (state == 'NumberFinalInPossibleOperation_' and (input == ' ' or input == '\t')): - return 'NumberFinalInPossibleOperation_' - if (state == 'NumberFinalInPossibleOperation' and input == '-'): - return 'PossibleArithmeticMinus' - if (state == 'NumberFinalInPossibleOperation_' and input == '-'): - return 'PossibleArithmeticMinus' - if (state == 'NumberFinalInPossibleOperation'): - return 'InitialState' - if (state == 'NumberFinalInPossibleOperation_'): - return 'InitialState' - else: # esse else ta errado - return 'NumberFinal' - -def getNextState(state: str, input: str) -> str: - if (state == 'MalformedToken'): - return 'MalformedTokenFinal'; - if (not state == 'InitialState'): - automata: Automata = findApropriateAutomata(state); - return automata(state, input); - if (input == '/'): - return 'PossibleComment'; - elif (isDelimiter(input)): - return 'Delimiter' - elif (re.match( r'[a-zA-Z]+', input)): # Se for uma letra - return 'Identifier' - elif (input == '"'): - return 'String'; - elif (input == '&'): - return 'PossibleLogical&&'; - elif (input == '|'): - return 'PossibleLogical||'; - elif (input == '!'): - return 'PossibleLogical!'; - elif (input == '=' or input == '<' or input == '>'): - return 'Relational'; - elif (input == '+'): - return 'Arithmetic'; - elif (input == '*'): - return 'Arithmetic*'; - elif (input == '-'): - return 'Arithmetic-'; - elif (re.match(r'\d', input)): - return 'Number'; - elif (input == ' ' or input == '\t' or input == '\n'): - return ('InitialState'); - return 'MalformedToken'; - -def isFinalState(state: 
str): - finalStates = { - 'DelimiterFinal', - 'MalformedStringFinal', - 'StringFinal', - 'LineCommentFinal', - 'BlockCommentFinal', - 'ArithmeticFinal', - 'LogicalOperatorFinal', - 'NumberFinal', - 'ArithmeticOperatorFinal', - 'RelationalOperatorFinal', - 'IdentifierFinal', - 'MalformedNumberFinal', - 'MalformedTokenFinal', - }; - if state in finalStates: - return True; - return False; - -def getTokenType(state: str): - stateToTokenType = { - 'DelimiterFinal': 'DEL', - 'MalformedStringFinal': 'CMF', - 'StringFinal': 'CAC', - 'LineCommentFinal': 'COM', - 'BlockCommentFinal': 'CMB', - 'ArithmeticFinal': 'ART', - 'LogicalOperatorFinal': 'LOG', - 'MalformedTokenFinal': 'TMF', - 'RelationalOperatorFinal': 'REL', - 'ArithmeticOperatorFinal': 'ART', - 'NumberFinal': 'NRO', - 'MalformedNumber': 'NMF', - 'MalformedNumberFinal': 'NMF', - 'IdentifierFinal': 'IDE', - } - return stateToTokenType.get(state, 'None'); - -def toFinalState(state: str): - return state + 'Final'; - -def findApropriateAutomata(state: str) -> Automata: - if ('Identifier' in state): - return IdendifierAutomata; - elif ('Delimiter' in state): - return DelimiterAutomata; - elif ('String' in state): - return StringAutomata; - elif ('Comment' in state): - return CommentAutomata; - elif ('Logical' in state): - return LogicalOperatorAutomata; - elif ('Relational' in state): - return RelationalOperatorAutomata; - elif ('Arithmetic' in state): - return ArithmeticOperatorAutomata; - elif ('Number' in state): - return NumbertAutomata; - return ErrorAutomata; - -def generateToken(state: str, lineNumber: int, lineText: str, tokenStartIndex: int, tokenEndIndex: int): - tokenType = getTokenType(state); - tokenText = lineText[tokenStartIndex:tokenEndIndex]; - if (tokenType == 'IDE'): - tokenType = 'PRE' if isReserved(tokenText) else 'IDE'; - if (tokenType == 'CAC'): - tokenType = 'CMF' if hasNonASCII(tokenText) else 'CAC'; - return Token(tokenType, lineNumber, tokenStartIndex, tokenEndIndex, tokenText); - - -def 
findTokensInString(line: str, lineCount: int, initialState: str, overflow: str) -> ResTokenList: - lineLength: int = len(line); - tokenStartIndex: int = 0; - currentIndex: int = 0; - currentState: str = initialState; - tokensFoundInThisLine: list[Token] = []; - tokenOverflow: str = overflow if initialState == 'BlockComment' else ''; - - exitLoop = False; - - while (not exitLoop and currentIndex < lineLength): - # Se ainda no estado inicial, considere o caractere atual como inicio do token - if (currentState == 'InitialState'): - tokenStartIndex = currentIndex; - - # Caractere atual - character: str = line[currentIndex]; - - # Proximo estado, dado o caractere lido - nextState: str = getNextState(currentState, character); - - if (nextState == 'NumberFinalInPossibleOperation'): - token = generateToken('NumberFinal', lineCount, line, tokenStartIndex, currentIndex); - tokensFoundInThisLine.append(token); - tokenStartIndex = currentIndex; - currentIndex = currentIndex - 1; - - if (nextState == 'GoBack'): - nextState = 'InitialState'; - currentIndex = tokenStartIndex - 1; - - # Se for um estado final, gere um token - if (isFinalState(nextState)): - token = generateToken(nextState, lineCount, line, tokenStartIndex, currentIndex); - if (not (token.token == 'COM' or token.token == 'CMB')): - tokensFoundInThisLine.append(token); # Apos salvar o token - currentState = 'InitialState'; # Volte para o estado inicial - - # Do contrario, leia o proximo caractere - else: - # Se a linha termina e o estado não é final, decrementa o index - # para chegar num estado final na proxima iteração - if (currentIndex + 1 >= lineLength and not (nextState == 'InitialState' or nextState == 'BlockCommentOverflow')): - nextState = getNextState(nextState, '\n'); - currentIndex = currentIndex + 2; - token = generateToken(nextState, lineCount, line, tokenStartIndex, currentIndex); - if (not (token.token == 'COM' or token.token == 'CMB' or token.token == 'None')): - tokensFoundInThisLine.append(token); 
# Apos salvar o token - currentState = 'InitialState'; - - currentState = nextState # Define o priximo estado - currentIndex = currentIndex + 1; - - if (currentState != 'BlockCommentOverflow'): - currentState = 'InitialState'; - else: - tokenOverflow = tokenOverflow + line[tokenStartIndex:].replace('\n', ''); - currentState = 'BlockComment'; - return ResTokenList(currentState, tokenStartIndex, lineCount, tokenOverflow, tokensFoundInThisLine); - - -def isError(err: str): - errors = {'CMF', 'CoMF', 'NMF', 'IMF', 'TMF'} - if err in errors: - return True; - return False - -def orderTokens(tk: Token): - if (isError(tk.token)): - return 1; - return 0; - +# Conta a quantidade de error léxicos presentes em uma lista de tokens def errorCount(tk: 'list[Token]'): ec = 0 for t in tk: @@ -361,8 +19,7 @@ def errorCount(tk: 'list[Token]'): def lexico(): source_directory = 'entrada'; - # Get the file handler - #fhand = open('src/entrada.txt', 'r'); + entry_files: list[str] = listDirFiles(source_directory); tokenListPerFile: TokenListPerFile = {} @@ -371,13 +28,12 @@ def lexico(): for filename in entry_files: filepath = source_directory + '/' + filename; # define o caminho para o arquivo source_file = open(filepath, 'r'); # abre o arquivo - tokensFound: list[Token] = readFileLines(source_file, onReadLine); # le os arquivos e recupera os token + tokensFound: list[Token] = readFileLines(source_file, findTokensInString); # le os arquivos e recupera os tokens tokensFound.sort(key=orderTokens); errorsNum = errorCount(tokensFound); # salva as informaações em um arquivo - #print(tabulate(tokensFound, headers=['token', 'line', 'tokeStartIndex', 'tokenEndIndex', 'value'])); tokenListPerFile[filename] = []; for token in tokensFound: formatedOutput = '{0:02d} {1:s} {2:s}\n'.format(token.line, token.token, token.value.replace('\n', '')); @@ -398,4 +54,4 @@ def lexico(): outputFile.close(); return; -lexico(); \ No newline at end of file +lexico(); From b9f43aad7912bad275243aafb94600b62f5550ff 
Mon Sep 17 00:00:00 2001 From: Daniel Santa Rosa Date: Sun, 9 Oct 2022 14:36:37 -0300 Subject: [PATCH 20/20] =?UTF-8?q?=F0=9F=93=9D=20Update=20documentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 136 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ef1cdf2..c4969d8 100644 --- a/README.md +++ b/README.md @@ -1 +1,136 @@ -# Compiladores + + + +## Sobre o projeto +Um analisador léxico para uma pseudo linguagem de programação escrito em python. + +A entrada para este analisador é um conjunto de arquivos texto que será processado de acordo com a estrutura léxica da linguagem e dará como saída um conjunto de arquivos de saída apresentando a lista de tokens, proveniente da análise léxica, além dos erros léxicos, caso existam. +
+ +
+ + + +## Tabela de Conteúdo + +- [Sobre o Projeto](#sobre-o-projeto) +- [Tabela de Conteúdo](#tabela-de-conte%C3%BAdo) +- [Feito Com](#feito-com) +- [Começando](#come%C3%A7ando) + - [Pré-requisitos](#pr%C3%A9-requisitos) + - [Estrutura de Arquivos](#estrutura-de-arquivos) + - [Instalação](#instala%C3%A7%C3%A3o) + - [Linting](#linting) + - [Edição](#edi%C3%A7%C3%A3o) + - [Executar projeto](#executar-projeto) +- [Contribuição](#contribui%C3%A7%C3%A3o) + + + +
+ +## Feito Com + +Abaixo segue o que foi utilizado na criação deste projeto: + +- [Python](https://www.python.org/) - Python é uma linguagem de programação que permite trabalhar rapidamente +e integrar os sistemas de forma mais eficaz. + +
+ + + +## Começando + +Para conseguir rodar o projeto, siga os passos abaixo. + +### Pré-requisitos + +Antes de seguirmos, é preciso que você tenha o ambiente configurado para criar e testar aplicações em Python. Caso não tenha o python3 instalado na sua máquina, verifique como pode instalar na sua plataforma seguindo as instruções disponíveis na página do projeto: [Python.org](https://www.python.org/) + +### Estrutura de Arquivos + +A estrutura de arquivos está da seguinte maneira: + +```bash +Compiladores +├── .vscode/ +├── entrada/ +├── saida/ +├── src/ +│ ├── automata/ +│ │ └── automataName.py +│ ├── TokenUtils/ +│ │ └── utilityName.py +│ ├── filesystem.py +│ └── main.py +├── .gitignore +└── README.md +``` + +Serão explicados os arquivos e diretórios na seção de [Edição](#edição). + +### Instalação + +1. Clone o projeto utilizando o comando: + +```sh +$ git clone https://github.com/DanielSRS/Compiladores +``` + +2. Navegue para o diretório raiz do projeto e crie as pastas necessárias para execução com o comando: + +```sh +$ cd Compiladores +$ mkdir entrada +$ mkdir saida +``` + +### Linting +O código do projeto é tipado. Esta etapa não é necessária, mas para ter uma melhor experiência habilite linting no seu editor de preferência, e defina a verificação de tipos como 'strict'. +
+
+Se você usa o Visual Studio Code como editor não precisa fazer nada. + +
+ +### Edição + +Nesta seção haverá instruções caso você queira editar o projeto, explicando para que os diretórios são utilizados e também os arquivos de configuração. + +- **.vscode** - Arquivos de configuração do Visual Studio Code. Esses arquivos não são necessários caso você não use o VS Code como editor. São apenas as configurações descritas na seção de [Linting](#linting). + +- **entrada** - Diretório contendo todos os arquivos fonte que irão ser processados pelo analisador léxico. Se não houver nenhum arquivo, não será produzido nenhum arquivo de saída após execução. Se o diretório estiver ausente, um erro acontecerá ao executar o projeto. + +- **saida** - Após execução do projeto, o analisador léxico irá gerar arquivos de saída neste diretório contendo as informações processadas em cada arquivo de entrada. + +- **src** - Diretório contendo todos os arquivos da aplicação, é criado um diretório `src` para que o código da aplicação possa ser isolado em um diretório e facilmente portado para outros projetos, se necessário. + + - **automata** - Pacote Python onde estão agrupados todos os autômatos para processamento de lexemas e funções relacionadas; + + - **tokenUtils** - Pacote Python onde estão agrupados todos os módulos para geração, processamento e manipulação de tokens, além de funções relacionadas; + + - **main.py** - Arquivo responsável por centralizar o código do diretório `src`, aqui são realizadas as operações principais de abertura e leitura dos arquivos de código fonte (presentes no diretório 'entrada') e gravação da lista de tokens nos arquivos de saída (no diretório 'saida'). + + - **filesystem.py** - Operações relacionadas ao sistema de arquivos, como a abertura e leitura de arquivos; + + +- **.gitignore** - Arquivo de configuração do git contendo informações de arquivos que não devem ser versionados junto com o código fonte; + +- **README.md** - Este arquivo. 
Aqui é feita a documentação básica do projeto com instruções de instalação, configuração e execução. + +## Executar projeto + +- Ainda no diretório raiz: + + ```sh + $ python3 src/main.py + ``` +- Verifique a saída no diretório 'saida' + +
+ +## Contribuição + +- Quaisquer dúvidas, sugestões ou problemas que encontrar, fique livre para abrir uma issue. +- Se quiser contribuir ajustando o código, implementando novas funcionalidades ou corrigindo bugs, faça um fork do projeto, faça as alterações necessárias como descrito na seção de [Edição](#edição) e abra um pull request.