本人的分享均来自于实际设计过程中的感悟
不能保证分享成果的正确性,如有错误,请各路大神指出,我会虚心学习,感谢!!!
不能保证分享成果的正确性,如有错误,请各路大神指出,我会虚心学习,感谢!!!
有时候我很好奇:编译器是如何知道我们输入的代码是什么意思的?它又是如何把我们的代码编译成二进制可执行文件的呢?今天我们来实现编译器的第一步——一个非常简单的C语言词法分析器。
用于测试的代码文件,hello.c代码如下:
/* Global declarations used as lexer input. `uint` and `string` are not
   standard C types; the lexer only tokenizes, so they come out as ID tokens. */
uint a=2147483649,b=321;
double c=111.1;
string str="ABC123\n";
/* Lexer input exercising ++ / --, !=, if/else and a string literal that
   contains the escape sequences \' \" \\ \r \n. */
int main(int aa,int bb)
{
    int x=0,y=3;
    a++;
    a--;
    if(a!=b)
    {
        a=1;
    }
    else
    {
        a=2;
    }
    printf("ABC %d \'\" \\ 123\r\n",a);
}
/* Lexer input: returns the sum of a1 and a2. */
int add(int a1,int a2)
{
return a1+a2;
}
词法分析器的头文件 lexer.h 代码如下:
#ifndef LEXER_H #define LEXER_H #include#include #include #include #include #include //单词类型 enum TokenType { ID,//关键词 函数 全局变量 关键字 系统函数 NUM, //数字 STRING, //字符串 OP//操作符分割符 }; //单词属性 class Token { public: QStringList TokenType_str={"ID","NUM","STRING","OP"}; QString word;//单词内容 TokenType type;//单词类别 uint line;//单词所在行 Token() { } Token(QString word,TokenType type,uint line) { this->word=word; this->type=type; this->line=line; } void prt() { QString s="%1: %2 > %3 "; s=s.arg(line,5).arg(TokenType_str[type],10).arg(word,10); qDebug()< run(QString code="");//词法解析 }; #endif // LEXER_H
lexer.cpp
#include "lexer.h"
// Creates a lexer with no source text; scanning starts at line 1.
Lexer::Lexer()
    : codestr(""), line(1)
{
}
// Creates a lexer that holds `code` as its source text; scanning starts at line 1.
Lexer::Lexer(QString code)
    : codestr(code), line(1)
{
}
/**
 * Tokenizes C-like source text.
 *
 * @param code  Source to analyse. When non-empty it replaces the text stored
 *              in the lexer; when empty the previously stored text is used.
 * @return      The tokens in source order. Every token is also printed via
 *              Token::prt(), followed by a separator line.
 *
 * Recognised input:
 *  - identifiers / keywords                        -> ID
 *  - numbers (decimal, hex, binary prefix, '.')    -> NUM
 *  - "..." string literals                         -> STRING
 *  - '...' character literals                      -> NUM (decoded text)
 *  - operators and separators                      -> OP
 * Preprocessor lines (#...), // comments and block comments are skipped.
 * Any other character (whitespace, unsupported symbols) is ignored.
 */
QList<Token> Lexer::run(QString code)
{
    QList<Token> tokens;
    if (!code.isEmpty()) {
        codestr = code;
    }
    // Restart line counting so that repeated calls report correct lines.
    line = 1;

    // Scan UTF-8 bytes: every byte of a multi-byte character is >= 0x80, so
    // it can never be mistaken for '\\', a quote or any other ASCII symbol.
    const QByteArray utf8 = codestr.toUtf8();
    const char *p = utf8.constData(); // NUL-terminated by QByteArray

    const auto isIdentStart = [](char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
    };
    const auto isDigit = [](char c) {
        return c >= '0' && c <= '9';
    };
    const auto isIdentPart = [&](char c) {
        return isIdentStart(c) || isDigit(c);
    };
    // Hex digits already cover the 'b'/'B' prefix letters and the 'e' exponent.
    const auto isNumberPart = [&](char c) {
        return isDigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
               || c == 'x' || c == 'X' || c == '.';
    };
    // Text between `start` and the current scan position (ASCII only).
    const auto textFrom = [&p](const char *start) {
        return QString::fromLatin1(start, static_cast<int>(p - start));
    };
    const auto addToken = [&tokens](const QString &word, TokenType type, uint atLine) {
        tokens.append(Token(word, type, atLine));
    };

    while (*p != '\0') {
        const char *start = p;  // first byte of the current lexeme
        const char tk = *p++;   // current character; *p is now the lookahead

        if (tk == '\n') {
            ++line;
        }
        else if (tk == '#') {
            // Preprocessor directives are not supported: skip the rest of the line.
            while (*p != '\0' && *p != '\n') ++p;
        }
        else if (isIdentStart(tk)) {
            while (isIdentPart(*p)) ++p;
            addToken(textFrom(start), TokenType::ID, line);
        }
        else if (isDigit(tk)) {
            while (isNumberPart(*p)) ++p;
            addToken(textFrom(start), TokenType::NUM, line);
        }
        else if (tk == '/') {
            if (*p == '/') {
                // Line comment: stop at the newline so the main loop counts it.
                while (*p != '\0' && *p != '\n') ++p;
            }
            else if (*p == '*') {
                // Block comment: skip to the closing "*/" (or end of input),
                // counting the lines it spans.
                ++p;
                while (*p != '\0') {
                    if (p[0] == '*' && p[1] == '/') {
                        p += 2;
                        break;
                    }
                    if (*p == '\n') ++line;
                    ++p;
                }
            }
            else {
                addToken(textFrom(start), TokenType::OP, line);
            }
        }
        else if (tk == '\'' || tk == '"') {
            const uint startLine = line; // report the line the literal starts on
            QByteArray text;             // decoded literal contents (UTF-8)
            while (*p != '\0' && *p != tk) {
                if (*p == '\\') {
                    ++p;
                    if (*p == '\0') break; // trailing backslash at end of input
                    switch (*p) {
                    case 'n': text.append('\n'); break;
                    case 'r': text.append('\r'); break;
                    case 't': text.append('\t'); break;
                    default:
                        // \' \" \\ and any unknown escape keep the character itself.
                        if (*p == '\n') ++line;
                        text.append(*p);
                        break;
                    }
                    ++p;
                }
                else {
                    if (*p == '\n') ++line;
                    text.append(*p++);
                }
            }
            // Consume the closing quote only if there is one; never step
            // past the terminating NUL of an unterminated literal.
            if (*p == tk) ++p;

            // Character literals are reported as NUM, string literals as STRING.
            const TokenType type = (tk == '"') ? TokenType::STRING : TokenType::NUM;
            addToken(QString::fromUtf8(text), type, startLine);
        }
        else if (tk == '=' || tk == '+' || tk == '-' || tk == '|' || tk == '&') {
            // Doubled form: == ++ -- || &&
            if (*p == tk) ++p;
            addToken(textFrom(start), TokenType::OP, line);
        }
        else if (tk == '!' || tk == '>' || tk == '<') {
            // Comparison form: != >= <=
            if (*p == '=') ++p;
            addToken(textFrom(start), TokenType::OP, line);
        }
        else {
            switch (tk) {
            case '~': case ';': case '{': case '}': case '(': case ')':
            case '[': case ']': case ',': case ':':
            case '*': case '%': case '^': case '?': case '.':
                addToken(textFrom(start), TokenType::OP, line);
                break;
            default:
                // Whitespace and unsupported characters are ignored.
                break;
            }
        }
    }

    // Print the token list.
    for (Token &t : tokens) {
        t.prt();
    }
    qDebug()<<"=================================================";
    return tokens;
}
在 main.cpp 文件中调用词法分析器,分析 hello.c 中的代码:
#include#include #include #include int main(int argc, char *argv[]) { QCoreApplication a(argc, argv); QFile file("./hello.c"); file.open(QFile::ReadOnly); QByteArray localReadAll = file.readAll(); file.close(); QString code=QString::fromUtf8(localReadAll); //开始词法分析 Lexer lx(code); QList tokens = lx.run(); return a.exec(); }
执行后的结果:
" 3: ID > uint "
" 3: ID > a "
" 3: OP > = "
" 3: NUM > 2147483649 "
" 3: OP > , "
" 3: ID > b "
" 3: OP > = "
" 3: NUM > 321 "
" 3: OP > ; "
" 4: ID > double "
" 4: ID > c "
" 4: OP > = "
" 4: NUM > 111.1 "
" 4: OP > ; "
" 5: ID > string "
" 5: ID > str "
" 5: OP > = "
" 5: STRING > ABC123n "
" 5: OP > ; "
" 7: ID > int "
" 7: ID > main "
" 7: OP > ( "
" 7: ID > int "
" 7: ID > aa "
" 7: OP > , "
" 7: ID > int "
" 7: ID > bb "
" 7: OP > ) "
" 8: OP > { "
" 9: ID > int "
" 9: ID > x "
" 9: OP > = "
" 9: NUM > 0 "
" 9: OP > , "
" 9: ID > y "
" 9: OP > = "
" 9: NUM > 3 "
" 9: OP > ; "
" 10: ID > a "
" 10: OP > ++ "
" 10: OP > ; "
" 11: ID > a "
" 11: OP > -- "
" 11: OP > ; "
" 12: ID > if "
" 12: OP > ( "
" 12: ID > a "
" 12: OP > != "
" 12: ID > b "
" 12: OP > ) "
" 13: OP > { "
" 14: ID > a "
" 14: OP > = "
" 14: NUM > 1 "
" 14: OP > ; "
" 15: OP > } "
" 16: ID > else "
" 17: OP > { "
" 18: ID > a "
" 18: OP > = "
" 18: NUM > 2 "
" 18: OP > ; "
" 19: OP > } "
" 20: ID > printf "
" 20: OP > ( "
" 20: STRING > ABC %d '" \ 123rn "
" 20: OP > , "
" 20: ID > a "
" 20: OP > ) "
" 20: OP > ; "
" 21: OP > } "
" 25: ID > int "
" 25: ID > add "
" 25: OP > ( "
" 25: ID > int "
" 25: ID > a1 "
" 25: OP > , "
" 25: ID > int "
" 25: ID > a2 "
" 25: OP > ) "
" 26: OP > { "
" 27: ID > return "
" 27: ID > a1 "
" 27: OP > + "
" 27: ID > a2 "
" 27: OP > ; "
" 28: OP > } "
=================================================
可以看出,词法分析器把代码中的关键字、操作符、字符串、分隔符等都分离出来了。当然,这是一个超级简单的词法分析器,功能并不完善,只是为了让我们了解在编译过程中编译器做了哪些工作。



