实验目的

设计并实现一个包含预处理功能的词法分析程序，加深对编译中词法分析过程的理解。

实验要求

1、实现预处理功能

源程序中可能包含有对程序执行无意义的符号，要求将其剔除。
首先编制一个源程序的输入过程，从键盘、文件或文本框输入若干行语句，依次存入输入缓冲区（字符型数据）；然后编制一个预处理子程序，去掉输入串中的回车符、换行符和跳格符等编辑性文字；把多个空白符合并为一个；去掉注释。

2、实现词法分析功能

输入：所给文法的源程序字符串。
输出：二元组（syn,token或sum）构成的序列。其中，
syn为单词种别码。
Token为存放的单词自身字符串。
Sum为整型常量。
具体实现时，可以将单词的二元组用结构进行处理。

3、待分析的C语言子集的词法(可以自行扩充，也可以按照C语言的词法定义)

1）关键字
main if then while do static int double struct break else long switch case typedef char return const float short continue for void default sizeof do
所有的关键字都是小写。
2）运算符和界符
+ - * / : := < <> <= > >= = ; ( ) #
3）其他标记ID和NUM
通过以下正规式定义其他标记：
ID→letter(letter|digit)*
NUM→digit digit*
letter→a|…|z|A|…|Z
digit→0|…|9…
4）空格由空白、制表符和换行符组成
空格一般用来分隔ID、NUM、专用符号和关键字，词法分析阶段通常被忽略。

4、各种单词符号对应的种别码

题目提供的种别码有部分错误，实际要求是可自行定义，这里提供一份参考:

keyword = {'main':1,'if':2,'then':3,'while':4,'do':5,
           'static':6,'int':7,'double':8,'struct':9,
           'break':10,'else':11,'long':12,'switch':13,
           'case':14,'typedef':15,'char':16,'return':17,
           'const':18,'float':19,'short':20,'continue':21,
           'for':22,'void':23,'ID':25,'NUM':26,'default':39,
           'sizeof':24,'stdio.h':40,'include':44,'scanf':48,'printf':49}
operator = {'+':27,'-':28,'*':29,'/':30,':':31,':=':32, '<':33,
            '<>':34,'<=':35,'>':36,'>=':37,'=':38,';':41,'(':42,
            ')':43,'#':0,'{':46,'}':47}

5、词法分析程序的主要算法思想

算法的基本任务是从字符串表示的源程序中识别出具有独立意义的单词符号，其基本思想是根据扫描到的单词符号的第一个字符的种类，拼出相应的单词符号。

代码：

import re
def pre(file):
    data = file.read()
    out =[]
    data = re.sub(re.compile('/\*{1,2}[\s\S]*?\*/'),"",data)
    data = re.sub(re.compile('//[\s\S]*?\n'), "", data)
    data = data.split("\n")
    for i in data:
        i = i.strip(' ').replace('\t', '')
        i = ' '.join(str(i).split())
        if(i!=''):
            out.append(i)
    return out
keyword = {'main':1,'if':2,'then':3,'while':4,'do':5,
           'static':6,'int':7,'double':8,'struct':9,
           'break':10,'else':11,'long':12,'switch':13,
           'case':14,'typedef':15,'char':16,'return':17,
           'const':18,'float':19,'short':20,'continue':21,
           'for':22,'void':23,'ID':25,'NUM':26,'default':39,
           'sizeof':24,'stdio.h':40,'include':44,'scanf':48,'printf':49}
operator = {'+':27,'-':28,'*':29,'/':30,':':31,':=':32, '<':33,
            '<>':34,'<=':35,'>':36,'>=':37,'=':38,';':41,'(':42,
            ')':43,'#':0,'{':46,'}':47}
with open('test.txt', 'r') as file:
    data = pre(file)
    #print(data)
    for i in range(len(data)):
        pattern1 = re.compile('[a-zA-Z.0-9]+')
        line = re.findall(pattern1,data[i])
        for j in line:
            if j in keyword:
                print(j+' -> '+str(keyword[j]))
            elif str(j).isdigit():
                print("'"+str(j)+"' -> 26")
            else:
                j = str(j).strip('.')
                print("'"+j+"' -> 25")
        line2 = re.sub(pattern1," ",data[i])
        line2=line2.split(" ")
        for j in line2:
            if j in operator:
                print(j+' -> '+str(operator[j]))

测试文本：

#include<stdio.h>
struct student
{
    int id;
    long int counts;
    /*

asfagsaf

    */
    /* data */
};
student stu[2000000];
int main(){
    for(long int i=0;i<2000000;i++){
        stu[i].id=i;
        stu[i].counts=0;
    }
    long int n,m;
    int a;
    scanf("%d",&n);
    for(long int i=0;i<n;i++){
        scanf("%ld",&a);
        stu[a].counts++;
    }
    scanf("%ld",&m);
    for(long int i=0;i<m;i++){
        scanf("%d",&a);
        if(stu[a].counts==0){
            printf("NO\n");
        }
        else{
            printf("YES\n");
        }
    }
    return 0;
}

输出：

include -> 44
stdio.h -> 40
# -> 0
< -> 33
> -> 36
struct -> 9
'student' -> 25
{ -> 46
int -> 7
'id' -> 25
; -> 41
long -> 12
int -> 7
'counts' -> 25
; -> 41
'student' -> 25
'stu' -> 25
'2000000' -> 26
int -> 7
main -> 1
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'2000000' -> 26
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
'stu' -> 25
'i' -> 25
'id' -> 25
'i' -> 25
= -> 38
; -> 41
'stu' -> 25
'i' -> 25
'counts' -> 25
'0' -> 26
= -> 38
; -> 41
} -> 47
long -> 12
int -> 7
'n' -> 25
'm' -> 25
; -> 41
int -> 7
'a' -> 25
; -> 41
scanf -> 48
'd' -> 25
'n' -> 25
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'n' -> 25
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
scanf -> 48
'ld' -> 25
'a' -> 25
'stu' -> 25
'a' -> 25
'counts' -> 25
} -> 47
scanf -> 48
'ld' -> 25
'm' -> 25
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'm' -> 25
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
scanf -> 48
'd' -> 25
'a' -> 25
if -> 2
'stu' -> 25
'a' -> 25
'counts' -> 25
'0' -> 26
( -> 42
printf -> 49
'NO' -> 25
'n' -> 25
} -> 47
else -> 11
{ -> 46
printf -> 49
'YES' -> 25
'n' -> 25
} -> 47
} -> 47
return -> 17
'0' -> 26
; -> 41
} -> 47

在这里插入图片描述

注意要点：

预处理文件后，从数据中提取单词和符号的过程使用了正则表达式，为[a-zA-Z.0-9]+，这里会出现一个小bug，算是逻辑漏洞，就是面向比如stu[i].counts=0;这样的语句，并没有根据.把语句前后分开，而是把.合并到了.counts里。解决方法是在输出的时候把.去掉。
预处理文件用到的正则表达式：/\*{1,2}[\s\S]*?\*/，目的是去除多行注释，//[\s\S]*?\n目的是去除单行注释。