词法分析 |Python 实现LL(1) 文法
文法如下:
文法:
1: lexp -> atom | list
2: atom -> number | identifier
3: list -> (lexp-seq)
4: lexp-seq -> lexp seq2
5: seq2 -> lexp seq2 | ε
需要判断是字符串为(a(b(2))(c)) 并且输出详细的步骤
'''
文法:
1: lexp -> atom | list
2: atom -> number | identifier
3: list -> (lexp-seq)
4: lexp-seq -> lexp seq2
5: seq2 -> lexp seq2 | ε
'''
# Token类型
TOKEN_NUMBER = 'NUMBER' # number
TOKEN_IDENTIFIER = 'IDENTIFIER' # identifier
TOKEN_LPAREN = 'LPAREN' # (
TOKEN_RPAREN = 'RPAREN' # )
TOKEN_EOF = 'EOF' # 结束符
class Token:
def __init__(self, type, value=None):
self.type = type # 类型
self.value = value # 值
class Lexer:
def __init__(self, text):
self.text = text # 输入文本
self.pos = 0
self.current_char = self.text[self.pos] if text else None
def error(self):
raise Exception('词法分析错误')
def advance(self):
self.pos += 1 # 移动到下一个字符
if self.pos > len(self.text) - 1:
self.current_char = None # 到达末尾
else:
self.current_char = self.text[self.pos] # 更新当前字符
def skip_whitespace(self):
while self.current_char and self.current_char.isspace():
self.advance() # 跳过空白字符
def number(self):
result = ''
while self.current_char and self.current_char.isdigit():
result += self.current_char
self.advance()
return int(result) # 返回整数
def identifier(self):
result = ''
while self.current_char and self.current_char.isalnum():
result += self.current_char
self.advance()
return result # 返回标识符
def get_next_token(self):
while self.current_char:
if self.current_char.isspace():
self.skip_whitespace() # 跳过空白字符
continue
if self.current_char.isdigit():
return Token(TOKEN_NUMBER, self.number()) # 返回数字
if self.current_char.isalpha():
return Token(TOKEN_IDENTIFIER, self.identifier()) # 返回标识符
if self.current_char == '(':
self.advance()
return Token(TOKEN_LPAREN, '(') # 返回左括号
if self.current_char == ')':
self.advance()
return Token(TOKEN_RPAREN, ')') # 返回右括号
self.error()
return Token(TOKEN_EOF) # 返回结束符
class Parser:
def __init__(self, lexer):
self.lexer = lexer
self.current_token = self.lexer.get_next_token()
self.step = 1 # 添加步骤计数
# 构建First集
self.FIRST = {
'lexp': {TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN}, # lexp -> atom | list
'atom': {TOKEN_NUMBER, TOKEN_IDENTIFIER}, # atom -> number | identifier
'list': {TOKEN_LPAREN}, # list -> (lexp-seq)
'lexp-seq': {TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN}, # lexp-seq -> lexp seq2
'seq2': {TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN, 'ε'} # seq2 -> lexp seq2 | ε
}
# 构建Follow集
self.FOLLOW = {
'lexp': {TOKEN_RPAREN, TOKEN_EOF, TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN}, # lexp -> atom | list
'atom': {TOKEN_RPAREN, TOKEN_EOF, TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN}, # atom -> number | identifier
'list': {TOKEN_RPAREN, TOKEN_EOF, TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN}, # list -> (lexp-seq)
'lexp-seq': {TOKEN_RPAREN}, # lexp-seq -> lexp seq2
'seq2': {TOKEN_RPAREN} # seq2 -> lexp seq2 | ε
}
# 构建预测分析表
self.PREDICT = {
('lexp', TOKEN_NUMBER): 'atom', # lexp -> atom
('lexp', TOKEN_IDENTIFIER): 'atom', # lexp -> atom
('lexp', TOKEN_LPAREN): 'list', # lexp -> list
('atom', TOKEN_NUMBER): 'number', # atom -> number
('atom', TOKEN_IDENTIFIER): 'identifier', # atom -> identifier
('list', TOKEN_LPAREN): '(lexp-seq)',
('lexp-seq', TOKEN_NUMBER): 'lexp seq2', # lexp-seq -> lexp seq2
('lexp-seq', TOKEN_IDENTIFIER): 'lexp seq2', # lexp-seq -> lexp seq2
('lexp-seq', TOKEN_LPAREN): 'lexp seq2', # lexp-seq -> lexp seq2
('seq2', TOKEN_NUMBER): 'lexp seq2', # seq2 -> lexp seq2
('seq2', TOKEN_IDENTIFIER): 'lexp seq2', # seq2 -> lexp seq2
('seq2', TOKEN_LPAREN): 'lexp seq2', # seq2 -> lexp seq2
('seq2', TOKEN_RPAREN): 'ε' # seq2 -> ε
}
def predict(self, non_terminal):
"""使用预测分析表进行分析"""
key = (non_terminal, self.current_token.type)
if key not in self.PREDICT:
self.error()
return self.PREDICT[key]
def error(self):
raise Exception('语法分析错误')
def eat(self, token_type, stack=''):
if self.current_token.type == token_type:
# 显示终结符匹配
remaining_input = self.get_remaining_input()
match_str = f"匹配 {self.current_token.value}" if self.current_token.value else f"匹配 {token_type}"
self.print_step(stack, remaining_input, match_str)
self.current_token = self.lexer.get_next_token()
else:
self.error()
def print_step(self, stack, input_str, action):
"""打印当前分析步骤"""
# 只保留栈的最后一个非终结符
stack_parts = stack.split()
short_stack = stack_parts[-1] if stack_parts else ''
# 缩短输入显示,只显示前15个字符
short_input = input_str[:15] + ('...$' if len(input_str) > 15 else '')
print(f"{self.step:<3} {short_stack:<15} {short_input:<15} {action}")
self.step += 1
def get_remaining_input(self):
"""获取剩余输入"""
pos = self.lexer.pos - 1 if self.current_token.type != TOKEN_EOF else self.lexer.pos
return self.lexer.text[pos:] + '$'
# lexp -> atom | list
def lexp(self, stack=''):
current_stack = f"{stack}lexp"
remaining_input = self.get_remaining_input()
production = self.predict('lexp')
if production == 'atom':
self.print_step(current_stack, remaining_input, 'lexp -> atom')
self.atom(current_stack)
elif production == 'list':
self.print_step(current_stack, remaining_input, 'lexp -> list')
self.list(current_stack)
else:
self.error()
# atom -> number | identifier
def atom(self, stack=''):
current_stack = f"{stack} atom"
remaining_input = self.get_remaining_input()
if self.current_token.type == TOKEN_NUMBER:
self.print_step(current_stack, remaining_input, 'atom -> number')
self.eat(TOKEN_NUMBER, current_stack) # 传入当前栈
elif self.current_token.type == TOKEN_IDENTIFIER:
self.print_step(current_stack, remaining_input, 'atom -> identifier')
self.eat(TOKEN_IDENTIFIER, current_stack) # 传入当前栈
else:
self.error()
# list -> (lexp-seq)
def list(self, stack=''):
current_stack = f"{stack} list"
remaining_input = self.get_remaining_input()
self.print_step(current_stack, remaining_input, 'list -> (lexp-seq)')
self.eat(TOKEN_LPAREN, current_stack) # 传入当前栈
self.lexp_seq(current_stack)
self.eat(TOKEN_RPAREN, current_stack) # 传入当前栈
# lexp-seq -> lexp seq2
def lexp_seq(self, stack=''):
current_stack = f"{stack} (lexp-seq)"
remaining_input = self.get_remaining_input()
self.print_step(current_stack, remaining_input, 'lexp-seq -> lexp seq2')
self.lexp(current_stack)
self.seq2(current_stack)
# seq2 -> lexp seq2 | ε
def seq2(self, stack=''):
current_stack = f"{stack} seq2"
remaining_input = self.get_remaining_input()
production = self.predict('seq2')
if production == 'lexp seq2':
self.print_step(current_stack, remaining_input, 'seq2 -> lexp seq2')
self.lexp(current_stack)
self.seq2(current_stack)
elif production == 'ε':
self.print_step(current_stack, remaining_input, 'seq2 -> ε')
return
else:
self.error()
def parse(self):
print(f"{'步骤':<3} {'栈':<15} {'输入':<15} {'动作'}")
print("-" * 50)
self.lexp()
if self.current_token.type != TOKEN_EOF:
self.error()
return True
def main():
# 测试用例
test = "(a(b(2))(c))"
print(f"\n分析输入: {test}\n")
try:
lexer = Lexer(test)
parser = Parser(lexer)
result = parser.parse()
print("\n语法分析结果: 合法")
except Exception as e:
print(f"\n语法分析结果: 不合法 ({str(e)})")
if __name__ == '__main__':
main()
执行效果


