The phase between lexical analyzer and semantic analyzer.
Also called as synthesizing. Get rid of from punctitions or other things.
We need a tool to express syntax which we called as Context Free Grammer.
:::warning
* Token: functionality of the lexeme = tADD
, tMULT
...
* Lexeme: +
is a lexeme for tADD
:::
if (k) {
i ++;
}
If we write a lexical analyzer our program will generate these tokens:
tIF
tLBR
tIDENT
tRBR
tLCR
tIDENT
tPLUS
tPLUS
tRCR
Token | Lexeme |
---|---|
tIF | if |
tLBR | ( |
tIDENT | variable... |
There some important facts about lexemes and tokens which are:
Regex is a sequence of characters that specifies a search pattern [wiki].
a
mathces a
[123]
mathces 1 or 2 or 3
[1-8]
mathces 1 or 2 ....
[a-z]
matches a,b ..
[a-zA-Z]
matches any letter[^a-zA-Z]
matches any non-letterA+
matches A,AA ...
A*
matches ,A,AA ...
[a-zA-Z]*
any string[0-9]+
any number[a-zA-Z]{9}
matches any 9 character string[a-zA-Z]{2,}
matches any string len(str) > 2 -?
zero or one repeated Flex stands for fast lexical analyzer.
%%
[a-z]* return tLSTR;
[A-Z]* return tBSTR;
[0-9]* return tNUM;
<<EOF>> return 0;
.
%%
main() {
yylex();
}
:::border
flex scanner.flx
gcc lex.yy.c -lfl
:::
yytext
: String that matches with regular expressionyyleng
: Length of string that matchesREJECT
: Reject match go downINITIAL
.%x comment
%%
"/*" BEGIN(comment);
<comment>"*/" BEGIN(INITIAL);
%%
int noOfLines = 1;
int search_pos(char * str, char a){
int k;
for (k = 0; k < strlen(str); k++) {
if(str[k] == a){
return k;
}
}
}
%x skipstr
DIGIT [0-9]
NONZERODIGIT [1-9]
CAPLETTER [A-Z]
LOWLETTER [a-z]
LETTER [{CAPLETTER} {LOWLETTER}]
STRING [{DIGIT}{LETTER}]
%%
\n noOfLines++;
\"([^\"]?)*\"
{
int k;
yytext++;
yytext[strlen(yytext)-1] = '\0';
printf("%d tSTRING (%s)\n",noOfLines,yytext);
for (k = 0; k < strlen(yytext); k++) {
if(yytext[k] == '\n'){
noOfLines ++;
}
}
}
Mail printf("%d tMAIL\n",noOfLines);
schedule printf("%d tSCHEDULE\n",noOfLines);
send printf("%d tSEND\n",noOfLines);
from printf("%d tFROM\n",noOfLines);
to printf("%d tTO\n",noOfLines);
set printf("%d tSET\n",noOfLines);
end\ Mail printf("%d tENDMAIL\n",noOfLines);
end\ schedule printf("%d tENDSCH\n",noOfLines);
[a-zA-Z0-9\.\-\_]+@([a-zA-Z0-9-]+\.){1,2}[a-zA-Z0-9]+
{
int atpos = search_pos(yytext,'@');
if(yytext[0] == '.' || yytext[atpos-1] == '.' || yytext[atpos+1] == '-'){
REJECT;
}
int k;
for(k = 0; k < atpos; k++ ){
if(yytext[k] == '.' && yytext[k+1] == '.'){
REJECT;
}
}
int l;
for(l = atpos; l < strlen(yytext); l++ ){
if(yytext[l] == '.'){
if(yytext[l-1] == '-' || yytext[l+1] == '-'){
REJECT;
}
}
}
printf("%d tADDRESS (%s)\n",noOfLines, yytext);
}
[0-9]{2}\/[0-9]{2}\/[0-9]{4} printf("%d tDATE (%s)\n",noOfLines,yytext);
[0-9]{2}:[0-9]{2} printf("%d tTIME (%s)\n",noOfLines,yytext);
[a-zA-Z_][a-zA-Z0-9_]* printf("%d tIDENT (%s)\n",noOfLines,yytext);
, printf("%d tCOMMA\n",noOfLines);
\: printf("%d tCOLON\n",noOfLines);
\( printf("%d tLPR\n",noOfLines);
\) printf("%d tRPR\n",noOfLines);
\[ printf("%d tLBR\n",noOfLines);
\] printf("%d tRBR\n",noOfLines);
\@ printf("%d tAT\n",noOfLines);
<<EOF>> return 0;
[^ ] printf("%d ILLEGAL CHARACTER (%s)\n",noOfLines,yytext);
.
%%
main() {
yylex();
}
%{
#include "hw2.tab.h"
%}
tLOCALPART [A-Za-z0-9\-_]+(([A-Za-z0-9\-_]*|\.?)[A-Za-z0-9\-_])*
tDOMAIN [A-Za-z0-9]+[A-Za-z0-9\-]*[A-Za-z0-9]+|[A-Za-z0-9]{1}
%%
"Mail" return tMAIL;
"end Mail" return tENDMAIL;
"schedule" return tSCHEDULE;
"end schedule" return tENDSCH;
"send" return tSEND;
"set" return tSET;
"to" return tTO;
"from" return tFROM;
@ return tAT;
, return tCOMMA;
: return tCOLON;
\( return tLPR;
\) return tRPR;
\[ return tLBR;
\] return tRBR;
[a-zA-Z\_]+[a-zA-Z0-9\_]* return tIDENT;
["][^\"]*["] return tSTRING;
[0-9]{2}"/"[0-9]{2}"/"[0-9]{4} return tDATE;
[0-9]{2}:[0-9]{2} return tTIME;
{tLOCALPART}"@"({tDOMAIN}"."{tDOMAIN}"."{tDOMAIN}|{tDOMAIN}"."{tDOMAIN}) return tADDRESS;
[ \t\n]+
. return yytext[0];
%%