// A detailed lexer walkthrough (including how comments are skipped); the explanation is given alongside the code below.
#include<bits/stdc++.h>
using namespace std;
// Number of elements in a true array: total byte size divided by the size of
// one element. Only meaningful for arrays, not for pointers.
#define ARRAY_LENGTH(arr) (sizeof(arr) / sizeof(arr[0]))
// Keyword table: isKeyWord() reports a word as a keyword when it equals one of
// these entries.
string KEY_WORD[] = {"int","char","string","bool","float","double","true","false","return","if","else","while","for","default","do","public","static","switch","case","private","protected"};
// Delimiter table. '/' is included as well so that the tokenizer can inspect
// the following character and recognise the start of a comment.
char BOUND_CHAR[] = {',', ';', '(', ')', '{', '}', '[', ']', '\'', '\"', '/'};
int pos = 0;// index of the next unread character of the input; advanced by getNextWord()
short in_annotation = 0;// comment state: 0 = not inside a comment (default), 1 = inside a line comment, 2 = inside a block comment (kept distinct because the two kinds end on different conditions)
// Token categories: keyword, identifier, constant, operator and delimiter,
// plus two bookkeeping values used by the tokenizer itself.
// main() prints the numeric value of the category, so the order of the
// enumerators below is part of the program's output.
enum WordTypeKind {
    KEYWORD,     // 0: a word listed in KEY_WORD
    IDENTIFIER,  // 1: any other word starting with a letter or '_'
    CONSTANT,    // 2: a run of decimal digits
    OPERATOR,    // 3: an operator
    DELIMITER,   // 4: a character from BOUND_CHAR
    ANNOTATION,  // 5: a comment marker; main() never prints these
    ERROR        // 6: initial value of every token; returned when the input is exhausted
};
// One token produced by getNextWord(): its category and its text.
struct WORD {
WordTypeKind wordType; // category of the token (see WordTypeKind)
string value; // the characters that make up the token, as read from the input
};
// Reads the text file `fileName` line by line and returns its contents.
// Every line read is followed by a single '\n' in the result, including the
// last one. If the file cannot be opened, a message is written to stderr and
// an empty string is returned.
std::string openFile(std::string fileName) {
    std::ifstream input(fileName);

    // Bail out early when the file could not be opened.
    if (!input.is_open()) {
        std::cerr << "无法打开文件!" << '\n';
        return std::string();
    }

    std::string text;
    for (std::string line; std::getline(input, line);) {
        text += line;
        text += '\n';
    }

    // Release the file handle before returning.
    input.close();
    return text;
}
// Writes `content` followed by a newline to the file `fileName`, replacing any
// previous contents.
// Returns true only when the file was opened AND all data reached the file;
// on failure a message is written to stderr and false is returned.
bool writeFile(std::string fileName, std::string content) {
    // Named `out` so the local stream no longer shadows the function itself.
    std::ofstream out(fileName);
    if (!out.is_open()) {
        std::cerr << "无法打开文件!" << '\n';
        return false;
    }

    out << content << '\n';

    // close() flushes the buffered data; a failed write or flush leaves the
    // stream in a failed state, which must be reported instead of claiming
    // success.
    out.close();
    if (out.fail()) {
        std::cerr << "写入文件失败!" << '\n';
        return false;
    }
    return true;
}
// Returns true when `word` is exactly equal to one of the entries of the
// KEY_WORD table, false otherwise.
bool isKeyWord(std::string word) {
    for (const std::string& keyword : KEY_WORD) {
        if (keyword == word) {
            return true;
        }
    }
    return false;
}
//分词器,将字符串分成最小单位(关键字,标识符,运算符,界符和常量),关键字我们可以自己根据使用的高级语句自定义
WORD getNextWord(string str) {
string tempStr = "";//暂存这个单词
WORD newWord;//返回的词
newWord.wordType = ERROR;//方便后续退出循环
while (pos < str.length() && std::isspace(str[pos])) {
if(in_annotation == 1 && str[pos] == '\n') in_annotation = 0;//退出注释状态
++pos; // 跳过空白字符
}
if (pos >= str.length()) {
return newWord; // 结束
}
if(pos < str.length()) {//不越界
char c = str[pos++];
tempStr += c;
//标识符的命名规范:只能以字母或'_'开头
if(isalpha(c) || c == '_') {//这个词是关键词或标识符
// isalnum(str[pos])这个函数用来检查传递给它的字符是否是字母(isalpha)或者是数字(isdigit)
while(pos < str.length() && (isalnum(str[pos]) || str[pos] == '_')) {
tempStr += str[pos++];
}
//判断这个单词是标识符or关键字
if(isKeyWord(tempStr)) {//是关键字
newWord.wordType = KEYWORD;
} else {//标识符
newWord.wordType = IDENTIFIER;
}
} else if(isdigit(c)) {//数字开头只可能是常数,我们把所有数字读完
while(pos < str.length() && isdigit(str[pos])) {
tempStr += str[pos++];
}
newWord.wordType = CONSTANT;
} else {//只可能是运算符或界符
for(int i = 0; i < ARRAY_LENGTH(BOUND_CHAR); i++) {//是不是界符
if(c == BOUND_CHAR[i]) {
//遇到'/'判断是不是注释和注释类型
if(c == '/' && pos < str.length() && str[pos] == '/') {//是'//'类型
in_annotation = 1;
newWord.wordType = ANNOTATION;
pos++;
break;
} else if(c == '/' && pos < str.length() && str[pos] == '*') {//是'/**/'类型
in_annotation = 2;
newWord.wordType = ANNOTATION;
pos++;
break;
}
newWord.wordType = DELIMITER;
break;
}
}
//是运算符,注意双目运算符(三目运算符我们就不考虑了)
if(pos < str.length()) {//注意不要越界
newWord.wordType = OPERATOR;
char nextChar = str[pos];
//特判一下'*/'的情况,因为这是第二种注释的退出标识
if(c == '*' && nextChar == '/') {//退出注释状态,下面也不用看了
in_annotation = 0;
tempStr += nextChar;
newWord.wordType = ANNOTATION;
pos++;
newWord.value = tempStr;
return newWord;
}
//考虑一下所有的双目运算符
if((c == '+' || c == '-' || c == '*' || c == '/' || c == '!' || c == '^' || c == '%' || c == '=' || c == '<' || c == '>') && nextChar == '=') {
tempStr += nextChar;//更新一下nextChar
pos++;
}
if(c == '+' && nextChar == '+') {
tempStr += nextChar;//更新一下nextChar
pos++;
}
if(c == '-' && nextChar == '-') {
tempStr += nextChar;//更新一下nextChar
pos++;
}
if(c == '&' && nextChar == '&') {
tempStr += nextChar;//更新一下nextChar
pos++;
}
if(c == '|' && nextChar == '|') {
tempStr += nextChar;//更新一下nextChar
pos++;
}
if(c == '<' && nextChar == '<') {
tempStr += nextChar;//更新一下nextChar
pos++;
}
if(c == '>' && nextChar == '>') {
tempStr += nextChar;//更新一下nextChar
pos++;
}
}
}
newWord.value = tempStr;
}
return newWord;
}
//换行符占一个长度!!!
int main() {
//先读取txt文件
string fileName = "E:\\program\\bianyiyuanli\\1.txt";
string str = openFile(fileName);
WORD word;
while((word = getNextWord(str)).wordType != ERROR) {
if(word.wordType != ANNOTATION && in_annotation == 0){//我们只有在词语不是注释类型和不在注释状态才输出它
std::cout << "[" << word.wordType << ", " << word.value << "]" << std::endl;
}
}
return 1;
}