python实现c语言测试,Python实现的C语言词法分析

2023-09-24 11 0

编译原理课上的一个实验是做一个编译器前端的词法分析器,我选择了用Python来写C语言的词法分析。

词法分析器的功能是输入源程序,输出单词符号。当初定义Token(单词种别,属性值)序列的时候,是将单词种别用数字来表示,后来在做语法分析的时候,发现用数字是不太合理的,所以又对单词的种别码进行了一番修改。

我的程序的总体思路是先对源程序进行一遍扫描,将多余的空格和注释去除,然后再读一遍已经进行过预处理的源程序,进行单词的识别,转换成二元组,保存到token文件中,并建立符号表对标识符进行管理,如果发现了错误,对其的位置和错误信息进行打印。

在对单词的识别部分,我采用了有穷自动机的理论来进行识别。这样就可以根据现在的状态和输入符号决定其后继行为。因此在对单词的识别中,我画了很多的状态图来识别不同的单词,如字符串、数字等等。状态图的绘制中,本来想用visio来画的,后来的后来觉得太麻烦了,还是用了最快的手画的方法。

0818b9ca8b590ca3270a3433284dd417.png

图1.注释的状态转换图

0818b9ca8b590ca3270a3433284dd417.png

图2.标识符的状态转换图

0818b9ca8b590ca3270a3433284dd417.png

图3.字符串的状态转换图

0818b9ca8b590ca3270a3433284dd417.png

图4.界符的状态转换图

0818b9ca8b590ca3270a3433284dd417.png

图5.整常数、浮点常数的状态转换图

0818b9ca8b590ca3270a3433284dd417.png

图6.字符常数的状态转换图

关于错误处理方面,对于词法分析阶段可能遇到的几种错误(下图所列四种中的前三种),我都进行了相应的处理。但是第三点做得不太好:对字符常数中可以出现的字符限制得过于严格,例如分号等在我的词法分析器中是不能在字符串中出现的。

0818b9ca8b590ca3270a3433284dd417.png

图7.词法分析中的四种错误

测试程序如下,内包含主要的C语言的各种语句,含有少量的错误:

int main()

{

int _a;

char ch = 'f;

floatb,centigrade,fahrj@enheit;

char fd = '\n';

printf("please inputa);

scanf("%d",&a); /*mycomment1or***2*/

printf("please inputb");

scanf("%f",&b);

if (a==8.1.6)

{

centigrade=095*(b-32)/9; /*itismyc5435omment*/

printf("TheCentigrade is ",centigrade); /*mess/age*/

}

else if (a!=0)

{

fahrenheit=(9/5.0)*b++32; /*mycontent*/

printf("TheFahrenheit is fahrenheit); /*hello****/

}

return 0;

}

运行结果如下图所示:

0818b9ca8b590ca3270a3433284dd417.png

图8.测试程序的错误报告

这是用Python写的第一个稍微像点样的东西,所以很多地方写得不大好,代码结构也有点混乱。总而言之,就是在这样的条件下把编译原理的第一次实验给写完了。接下来是我的水水的代码了。

# -*- coding: utf-8 -*-

'''

Created on 2012-10-18

@author: zouliping

'''

import string

# The 32 reserved words of C, kept in the order of the original table.
_key = tuple(
    "auto break case char const continue default "
    "do double else enum extern float for "
    "goto if int long register return short "
    "signed static sizeof struct switch typedef union "
    "unsigned void volatile while".split()
)

# Characters that are reported as illegal when found inside an identifier.
# NOTE: analysis() consumes these as part of the identifier, so an operator
# such as '*' or '&' written directly after a name is reported as an error.
_abnormalChar = '@#$%^&*~'

# ---- scanner state shared by analysis() and the driver ----
_syn = ''        # category code of the most recently scanned token
_p = 0           # index into the source text
_value = ''      # text of the most recently scanned token
_content = ''    # the program text being analysed

# State variables of the three small automata
# (string literal, character constant, integer/float constant).
_mstate = _cstate = _dstate = 0

_line = 1        # current line number of the source program

_mysymbol = list()   # symbol table

def outOfComment():
    """Remove every ``/* ... */`` comment from the global ``_content``.

    The text is scanned once from left to right and rebuilt from the pieces
    that lie outside comments.  Each comment is cut out individually by
    position, so two comments with identical text cannot interfere with each
    other.

    Line breaks that occur inside a comment are kept, so that the line
    numbers counted later still match the original file.

    A comment that is opened but never closed is left in the text unchanged.
    Comment markers inside string literals are not recognised as such and
    are stripped like any other comment.
    """
    global _content

    kept = []
    cursor = 0
    while True:
        opener = _content.find('/*', cursor)
        if opener == -1:
            break
        # Search after the opening marker so that "/*/" does not close itself.
        closer = _content.find('*/', opener + 2)
        if closer == -1:
            break  # unterminated comment: keep the rest verbatim
        commentEnd = closer + 2
        kept.append(_content[cursor:opener])
        # Preserve the line structure of multi-line comments.
        kept.append('\n' * _content.count('\n', opener, commentEnd))
        cursor = commentEnd

    kept.append(_content[cursor:])
    _content = ''.join(kept)

def getMyProm(path=r'E://test.txt'):
    """Read the program to analyse and append it to the global ``_content``.

    Leading whitespace is removed from every line.  A line consisting only
    of whitespace is reduced to a bare newline instead of disappearing, so
    the number of lines in ``_content`` matches the file.

    path -- file to read; defaults to the location the script has always
            used, so existing calls without arguments behave as before.
    """
    global _content

    pieces = []
    # The with-block closes the file even if reading fails part-way.
    with open(path, 'r') as source:
        for line in source:
            stripped = line.lstrip()
            if not stripped and line.endswith('\n'):
                # lstrip() also ate the line terminator of a blank line.
                stripped = '\n'
            pieces.append(stripped)

    _content = _content + ''.join(pieces)

def _scanWord(text, start):
    """Scan an identifier or keyword starting at text[start].

    Returns (code, lexeme, next_index).  Characters from ``_abnormalChar``
    are consumed as part of the word and make it an '@-6' error.
    """
    wordChars = string.ascii_letters + string.digits + '_' + _abnormalChar
    stop = start
    while stop < len(text) and text[stop] in wordChars:
        stop += 1
    word = text[start:stop]
    if any(bad in word for bad in _abnormalChar):
        return '@-6', word, stop          # illegal character in identifier
    if word in _key:
        return word.upper(), word, stop   # keyword: code is the upper-cased word
    return 'ID', word, stop


def _scanString(text, start):
    """Scan a string literal whose opening quote is text[start].

    Returns (code, lexeme, next_index).  Any character may appear inside
    the literal; a backslash protects the character that follows it.  A
    literal that reaches the end of the line or of the input without a
    closing quote is an '@-2' error covering the rest of that line.
    """
    size = len(text)
    stop = start + 1
    while stop < size and text[stop] != '\n':
        current = text[stop]
        if current == '\\' and stop + 1 < size and text[stop + 1] != '\n':
            stop += 2                     # skip the escaped character
            continue
        if current == '"':
            return 'STRING', text[start:stop + 1], stop + 1
        stop += 1
    return '@-2', text[start:stop], stop  # string is not closed


def _scanNumber(text, start):
    """Scan an integer or floating constant starting at text[start].

    Returns (code, lexeme, next_index).  Digits, dots and letters are all
    consumed so that malformed numbers are reported as one lexeme.
    """
    bodyChars = string.digits + string.ascii_letters + '.'
    stop = start
    while stop < len(text) and text[stop] in bodyChars:
        stop += 1
    lexeme = text[start:stop]

    if any(c in string.ascii_letters for c in lexeme):
        return '@-7', lexeme, stop        # digits mixed with letters, e.g. 12AB56
    if len(lexeme) > 1 and lexeme[0] == '0' and lexeme[1] != '.':
        return '@-3', lexeme, stop        # leading zero, e.g. 095
    dots = lexeme.count('.')
    if dots == 0:
        return 'DIGIT', lexeme, stop
    if dots == 1:
        return 'FRACTION', lexeme, stop
    return '@-5', lexeme, stop            # more than one dot, e.g. 1.2.3


def _scanChar(text, start):
    """Scan a character constant whose opening quote is text[start].

    Returns (code, lexeme, next_index).  Accepted forms are a quote, then
    either one ordinary character or a backslash plus one character, then
    a closing quote.  Anything else is an '@-4' error that covers only the
    characters examined so far, so scanning resumes right after them.
    """
    size = len(text)
    stop = start + 1
    if stop < size and text[stop] == '\\' and stop + 1 < size and text[stop + 1] != '\n':
        stop += 2                         # escape sequence such as \n
    elif stop < size and text[stop] not in "'\n":
        stop += 1                         # one ordinary character
    elif stop < size and text[stop] == "'":
        return '@-4', text[start:stop + 1], stop + 1   # empty constant ''
    else:
        return '@-4', text[start:stop], stop           # nothing after the quote

    if stop < size and text[stop] == "'":
        return 'CHARACTER', text[start:stop + 1], stop + 1
    return '@-4', text[start:stop], stop  # character constant is not closed


def _scanOperator(text, start):
    """Scan an operator or delimiter starting at text[start].

    Returns (code, lexeme, next_index).  The code of an operator is its own
    text.  A character that belongs to no known token is returned the same
    way, as a one-character token.
    """
    pair = text[start:start + 2]
    if pair in ('<=', '>=', '!=', '++', '--', '==', '&&', '||'):
        return pair, pair, start + 2
    single = text[start]
    return single, single, start + 1


def analysis(mystr):
    """Scan the next token of *mystr*, starting at the global index ``_p``.

    The result is left in the globals ``_syn`` (category code) and
    ``_value`` (token text), and ``_p`` is advanced past the token.  Names
    classified as 'ID' are passed to inSymbolTable().

    Codes produced:
      'ID', upper-cased keyword, 'STRING', 'DIGIT', 'FRACTION', 'CHARACTER',
      the operator/delimiter text itself, and the error codes
      '@-1' newline, '@-2' unclosed string, '@-3' number with leading zero,
      '@-4' unclosed character constant, '@-5' number with several dots,
      '@-6' illegal character in identifier, '@-7' number containing letters.

    ``_syn`` and ``_value`` are reset on every call, so a result never leaks
    from the previous token.  Blanks, tabs and carriage returns before the
    token are skipped.  If only such whitespace remains, the call consumes
    it and reports '@-1' with an empty value, so that callers knowing only
    the codes above do not record an empty token.  The function never reads
    past the end of *mystr*.
    """
    global _p, _value, _syn

    _value = ''
    _syn = ''
    end = len(mystr)

    while _p < end and mystr[_p] in ' \t\r':
        _p += 1
    if _p >= end:
        _syn = '@-1'
        return

    ch = mystr[_p]
    if ch in string.ascii_letters or ch == '_':      # letter(letter|digit)*
        _syn, _value, _p = _scanWord(mystr, _p)
        if _syn == 'ID':
            inSymbolTable(_value)
    elif ch == '"':                                  # string literal
        _syn, _value, _p = _scanString(mystr, _p)
    elif ch in string.digits:                        # integer / float
        _syn, _value, _p = _scanNumber(mystr, _p)
    elif ch == "'":                                  # character constant
        _syn, _value, _p = _scanChar(mystr, _p)
    elif ch == '\n':                                 # end of line
        _p += 1
        _syn = '@-1'
    else:                                            # operators and delimiters
        _syn, _value, _p = _scanOperator(mystr, _p)

def inSymbolTable(token):
    """Add *token* to the symbol table ``_mysymbol`` if it is not there yet.

    The table keeps first-seen order; a token that is already present is
    left where it is.  Within this file the function is called only for
    tokens classified as 'ID', so keywords do not end up in the table.
    """
    if token in _mysymbol:
        return
    _mysymbol.append(token)

if __name__ == '__main__':
    # Driver: load the program, strip comments, then scan it token by token.
    # Tokens go to the token file as "<code>@<text>", lexical errors are
    # printed with their line number, and the collected identifiers are
    # written to the symbol-table file at the end.

    # Error code -> (text before the lexeme, text between lexeme and line no.)
    errorText = {
        '@-2': ('字符串 ', ' 不封闭! Error in line '),
        '@-3': ('数字 ', ' 错误,不能以0开头! Error in line '),
        '@-4': ('字符 ', ' 不封闭! Error in line '),
        '@-5': ('数字 ', ' 不合法! Error in line '),
        '@-6': ('标识符', ' 不能包含非法字符!Error in line '),
        '@-7': ('数字 ', ' 不合法,包含字母! Error in line '),
    }

    getMyProm()
    outOfComment()

    # The with-blocks close the output files even if scanning raises.
    with open(r'E://token.txt', 'w') as tokenFile:
        # "<" rather than "!=" so the loop ends even if _p overshoots.
        while _p < len(_content):
            analysis(_content)
            if _syn == '@-1':
                _line += 1  # a newline was consumed: keep the line count
            elif _syn in errorText:
                before, after = errorText[_syn]
                # Parenthesised single argument works as a print statement
                # and as a print() call.
                print(before + _value + after + str(_line))
            else:
                # No lexical error: record the token.
                tokenFile.write(str(_syn) + '@' + _value + '\n')

    with open(r'E://symbol_table.txt', 'w') as symbolTableFile:
        symbolTableFile.write('入口地址\t变量名\n')
        for i, symbolItem in enumerate(_mysymbol):
            symbolTableFile.write(str(i) + '\t\t\t' + symbolItem + '\n')

代码编程
赞赏

相关文章

动态改变shape color
一张图看懂阿里云网络产品[一]网络产品概览
bug人生–CF的那段时光
地图常见操作总结
WebView详解:Android和Js交互
西门子医疗创新产品与解决方案亮相第31届国际医疗仪器设备展览会