升C小调狂想曲

<递归的忧伤>
posts - 10, comments - 71, trackbacks - 0, articles - 0
   :: 首页 :: 新随笔 :: 联系 :: 聚合  :: 管理
首先祝贺北京奥运会精彩开幕!!我亲眼见到了鸟巢的壮观景象,跟一堆外国人在一起看开幕式时我第一次感觉这么自豪!加油奥运,加油中国!

今天把词法分析器写好了,一个巨大的switch。代码如下:

//CNScriptLex.h
 1 #ifndef CNSCRIPTLEX_H
 2 #define CNSCRIPTLEX_H
 3 
 4 #include "..\..\..\System\System.h"
 5 
 6 
 7 namespace CNScript_1
 8 {
 9 
10     namespace CNScriptLexTokenType
11     {
12         enum ECNScriptLexTokenType
13         {
14             None,
15             Keyword_if,
16             Keyword_else,
17             Keyword_for,
18             Keyword_do,
19             Keyword_while,
20             Keyword_return,
21             Identifier,
22             TypeName,
23             IntegerValue,
24             DoubleValue,
25             Operator,
26             Separator,
27             Space,
28             Comment,
29             BoolValue,
30             CharValue,
31             StringValue,
32             BigLeftBracket,
33             BigRightBracket,
34             MidLeftBracket,
35             MidRightBracket,
36             SmallLeftBracket,
37             SmallRightBracket,
38             Error
39         };
40     }
41 
42 
43     class CNScriptLexToken
44     {
45     public:
46         CNScriptLexToken();
47 
48         System::String ToString();
49 
50         CNScriptLexTokenType::ECNScriptLexTokenType    Type;
51         System::String                                Value;
52         int                                            LineNumber;
53         int                                            ColumnNumber;
54 
55     private:
56         System::String GetTypeString();
57     };
58 
59     
60     class CNScriptLexParser
61     {
62     public:
63         CNScriptLexParser();
64 
65         System::Collections::Generic::List<CNScriptLexToken>    Parse(const System::String& scriptCode);
66 
67     private:
68         int currInput;
69         int currLine;
70         int currCol;
71         int currState;
72         int eaten;
73         int currTokenStartLine;
74         int currTokenStartCol;
75         int codeLength;
76         System::String code;
77         const wchar_t* pcode;
78         System::Collections::Generic::List<CNScriptLexToken> tokens;
79 
80         void advance();
81         void accept(CNScriptLexTokenType::ECNScriptLexTokenType tokenType);
82         void error();
83         System::String tryGetValue();
84     };
85 
86 }
87 
88 #endif


//CNScriptLex.cpp
  1 #include "CNScriptLex.h"
  2 
  3 using namespace System;
  4 using namespace System::Collections::Generic;
  5 
  6 namespace CNScript_1
  7 {
  8     CNScriptLexToken::CNScriptLexToken()
  9     {
 10         Type = CNScriptLexTokenType::None;
 11         LineNumber = 0;
 12         ColumnNumber = -1;
 13     }
 14 
 15     String CNScriptLexToken::GetTypeString()
 16     {
 17         switch(Type)
 18         {
 19         case CNScriptLexTokenType::BigLeftBracket:
 20             return L"BLBracket";
 21         case CNScriptLexTokenType::BigRightBracket:
 22             return L"BRBracket";
 23         case CNScriptLexTokenType::BoolValue:
 24             return L"BoolValue";
 25         case CNScriptLexTokenType::CharValue:
 26             return L"CharValue";
 27         case CNScriptLexTokenType::Comment:
 28             return L"Comment";
 29         case CNScriptLexTokenType::Identifier:
 30             return L"Identifier";
 31         case CNScriptLexTokenType::Keyword_do:
 32             return L"Keyword_do";
 33         case CNScriptLexTokenType::Keyword_else:
 34             return L"Keyword_else";
 35         case CNScriptLexTokenType::Keyword_for:
 36             return L"Keyword_for";
 37         case CNScriptLexTokenType::Keyword_if:
 38             return L"Keyword_if";
 39         case CNScriptLexTokenType::Keyword_return:
 40             return L"Keyword_return";
 41         case CNScriptLexTokenType::Keyword_while:
 42             return L"Keyword_while";
 43         case CNScriptLexTokenType::MidLeftBracket:
 44             return L"MLBracket";
 45         case CNScriptLexTokenType::MidRightBracket:
 46             return L"MRBracket";
 47         case CNScriptLexTokenType::None:
 48             return L"None";
 49         case CNScriptLexTokenType::IntegerValue:
 50             return L"IntegerValue";
 51         case CNScriptLexTokenType::DoubleValue:
 52             return L"DoubleValue";
 53         case CNScriptLexTokenType::Operator:
 54             return L"Operator";
 55         case CNScriptLexTokenType::Separator:
 56             return L"Separator";
 57         case CNScriptLexTokenType::SmallLeftBracket:
 58             return L"SLBracket";
 59         case CNScriptLexTokenType::SmallRightBracket:
 60             return L"SRBracket";
 61         case CNScriptLexTokenType::Space:
 62             return L"Space";
 63         case CNScriptLexTokenType::StringValue:
 64             return L"StringValue";
 65         case CNScriptLexTokenType::TypeName:
 66             return L"TypeName";
 67         case CNScriptLexTokenType::Error:
 68             return L"Error";
 69         default:
 70             return L"None";
 71         }
 72     }
 73 
 74     String CNScriptLexToken::ToString()
 75     {
 76         String result = L"[TokenType:" + GetTypeString() + L"\tLine:" + String::ToString(LineNumber) + L"\tColumn:" + String::ToString(ColumnNumber) + L"\tValue:" + Value + L"]";
 77         result = result.Replace(L"\r\n", L"\\r\\n");
 78 
 79         return result;
 80     }
 81 
 82     CNScriptLexParser::CNScriptLexParser()
 83     {
 84     }
 85 
 86     void CNScriptLexParser::advance()
 87     {
 88         wchar_t prevChar = pcode[currInput];
 89 
 90         if(prevChar == L'\n')
 91         {
 92             currLine    ++;
 93             currCol        = 1;
 94             currInput    ++;
 95         }
 96         else
 97         {
 98             currCol        ++;
 99             currInput    ++;
100         }
101     }
102 
103     String CNScriptLexParser::tryGetValue()
104     {
105         int newTokenLength = currInput - eaten;
106 
107         return code.SubString(eaten, newTokenLength);
108     }
109 
110     void CNScriptLexParser::accept(CNScriptLexTokenType::ECNScriptLexTokenType tokenType)
111     {
112         int newTokenLength = currInput - eaten;
113 
114         CNScriptLexToken newToken;
115         newToken.LineNumber = currTokenStartLine;
116         newToken.ColumnNumber = currTokenStartCol;
117         newToken.Type = tokenType;
118         newToken.Value = code.SubString(eaten, newTokenLength);
119 
120         tokens.Add(newToken);
121 
122         eaten += newTokenLength;
123         currTokenStartLine = currLine;
124         currTokenStartCol = currCol;
125         currState = 0;
126     }
127 
128     void CNScriptLexParser::error()
129     {
130         accept(CNScriptLexTokenType::Error);
131         currInput = codeLength + 1;
132     }
133 
134     List<CNScriptLexToken> CNScriptLexParser::Parse(const System::String &scriptCode)
135     {
136         currInput    = 0;
137         currLine    = 1;
138         currCol        = 1;
139         currState    = 0;
140         eaten        = 0;
141         code        = scriptCode;
142         pcode        = scriptCode.Buffer();
143         codeLength    = scriptCode.Length();
144 
145         currTokenStartLine    = 1;
146         currTokenStartCol    = 1;
147         
148         tokens.Clear();
149 
150         while(currInput <= codeLength)
151         {
152             wchar_t ch = pcode[currInput];
153 
154             switch(currState)
155             {
156             case 0:
157                 {
158                     if(ch == 0)
159                     {
160                         advance();
161                     }
162                     else if(ch == L';')
163                     {
164                         currState = 1;
165                         advance();
166                     }
167                     else if(ch == L'/')
168                     {
169                         currState = 2;
170                         advance();
171                     }
172                     else if(ch == L' ' || ch == L'\r' || ch == L'\n' || ch == L'\t')
173                     {
174                         currState = 4;
175                         advance();
176                     }
177                     else if(ch == L'{')
178                     {
179                         currState = 5;
180                         advance();
181                     }
182                     else if(ch == L'}')
183                     {
184                         currState = 6;
185                         advance();
186                     }
187                     else if(ch == L'[')
188                     {
189                         currState = 7;
190                         advance();
191                     }
192                     else if(ch == L']')
193                     {
194                         currState = 8;
195                         advance();
196                     }
197                     else if(ch == L'(')
198                     {
199                         currState = 9;
200                         advance();
201                     }
202                     else if(ch == L')')
203                     {
204                         currState = 10;
205                         advance();
206                     }
207                     else if(ch == L'0')
208                     {
209                         currState = 11;
210                         advance();
211                     }
212                     else if(ch >= L'1' && ch <= L'9')
213                     {
214                         currState = 14;
215                         advance();
216                     }
217                     else if(ch == L'\'')
218                     {
219                         currState = 17;
220                         advance();
221                     }
222                     else if(ch == L'"')
223                     {
224                         currState = 23;
225                         advance();
226                     }
227                     else if(ch == L'=')
228                     {
229                         currState = 26;
230                         advance();
231                     }
232                     else if(ch == L'+')
233                     {
234                         currState = 28;
235                         advance();
236                     }
237                     else if(ch == L'-')
238                     {
239                         currState = 30;
240                         advance();
241                     }
242                     else if(ch == L'!')
243                     {
244                         currState = 32;
245                         advance();
246                     }
247                     else if(ch == L'|')
248                     {
249                         currState = 34;
250                         advance();
251                     }
252                     else if(ch == L'&')
253                     {
254                         currState = 36;
255                         advance();
256                     }
257                     else if(ch == L'*')
258                     {
259                         currState = 38;
260                         advance();
261                     }
262                     else if(ch == L'<')
263                     {
264                         currState = 39;
265                         advance();
266                     }
267                     else if(ch == L'>')
268                     {
269                         currState = 41;
270                         advance();
271                     }
272                     else if(ch == L'_')
273                     {
274                         currState = 43;
275                         advance();
276                     }
277                     else if((ch >= L'a' && ch <= L'z'|| (ch >= L'A' && ch <= L'Z'))
278                     {
279                         currState = 44;
280                         advance();
281                     }
282                     else
283                     {
284                         error();
285                     }
286                 }
287                 break;
288             case 1:
289                 {
290                     accept(CNScriptLexTokenType::Separator);
291                 }
292                 break;
293             case 2:
294                 {
295                     if(ch == L'/')
296                     {
297                         currState = 3;
298                         advance();
299                     }
300                     else
301                     {
302                         accept(CNScriptLexTokenType::Operator);
303                     }
304                 }
305                 break;
306             case 3:
307                 {
308                     if(ch != L'\r' && ch != L'\n' && ch != 0)
309                     {
310                         advance();
311                     }
312                     else
313                     {
314                         accept(CNScriptLexTokenType::Comment);
315                     }
316                 }
317                 break;
318             case 4:
319                 {
320                     if(ch == L' ' || ch == L'\r' || ch == L'\n' || ch == L'\t')
321                     {
322                         advance();
323                     }
324                     else
325                     {
326                         accept(CNScriptLexTokenType::Space);
327                     }
328                 }
329                 break;
330             case 5:
331                 {
332                     accept(CNScriptLexTokenType::BigLeftBracket);
333                 }
334                 break;
335             case 6:
336                 {
337                     accept(CNScriptLexTokenType::BigRightBracket);
338                 }
339                 break;
340             case 7:
341                 {
342                     accept(CNScriptLexTokenType::MidLeftBracket);
343                 }
344                 break;
345             case 8:
346                 {
347                     accept(CNScriptLexTokenType::MidRightBracket);
348                 }
349                 break;
350             case 9:
351                 {
352                     accept(CNScriptLexTokenType::SmallLeftBracket);
353                 }
354                 break;
355             case 10:
356                 {
357                     accept(CNScriptLexTokenType::SmallRightBracket);
358                 }
359                 break;
360             case 11:
361                 {
362                     if(ch == L'.')
363                     {
364                         currState = 12;
365                         advance();
366                     }
367                     else
368                     {
369                         accept(CNScriptLexTokenType::IntegerValue);
370                     }
371                 }
372                 break;
373             case 12:
374                 {
375                     if(ch >= L'0' && ch <= L'9')
376                     {
377                         currState = 13;
378                         advance();
379                     }
380                     else
381                     {
382                         error();
383                     }
384                 }
385                 break;
386             case 13:
387                 {
388                     if(ch >= L'0' && ch <= L'9')
389                     {
390                         advance();
391                     }
392                     else
393                     {
394                         accept(CNScriptLexTokenType::DoubleValue);
395                     }
396                 }
397                 break;
398             case 14:
399                 {
400                     if(ch >= L'0' && ch <= L'9')
401                     {
402                         advance();
403                     }
404                     else if(ch == L'.')
405                     {
406                         currState = 15;
407                         advance();
408                     }
409                     else
410                     {
411                         accept(CNScriptLexTokenType::IntegerValue);
412                     }
413                 }
414                 break;
415             case 15:
416                 {
417                     if(ch >= L'0' && ch <= L'9')
418                     {
419                         currState = 16;
420                         advance();
421                     }
422                     else
423                     {
424                         error();
425                     }
426                 }
427                 break;
428             case 16:
429                 {
430                     if(ch >= L'0' && ch <= L'9')
431                     {
432                         advance();
433                     }
434                     else
435                     {
436                         accept(CNScriptLexTokenType::DoubleValue);
437                     }
438                 }
439                 break;
440             case 17:
441                 {
442                     if(ch == L'\\')
443                     {
444                         currState = 18;
445                         advance();
446                     }
447                     else if(ch != L'\'' && ch != L'\r' && ch != L'\n' && ch != L'\t' && ch != 0)
448                     {
449                         currState = 21;
450                         advance();
451                     }
452                     else
453                     {
454                         error();
455                     }
456                 }
457                 break;
458             case 18:
459                 {
460                     if(ch != L' ' && ch != L'\r' && ch != L'\n' && ch != L'\t' && ch != 0)
461                     {
462                         currState = 19;
463                         advance();
464                     }
465                     else
466                     {
467                         error();
468                     }
469                 }
470                 break;
471             case 19:
472                 {
473                     if(ch == L'\'')
474                     {
475                         currState = 20;
476                         advance();
477                     }
478                     else
479                     {
480                         error();
481                     }
482                 }
483                 break;
484             case 20:
485                 {
486                     accept(CNScriptLexTokenType::CharValue);
487                 }
488                 break;
489             case 21:
490                 {
491                     if(ch == L'\'')
492                     {
493                         currState = 22;
494                         advance();
495                     }
496                     else
497                     {
498                         error();
499                     }
500                 }
501                 break;
502             case 22:
503                 {
504                     accept(CNScriptLexTokenType::CharValue);
505                 }
506                 break;
507             case 23:
508                 {
509                     if(ch != L'"' && ch != L'\\' && ch != L'\r' && ch != L'\n' && ch != 0)
510                     {
511                         advance();
512                     }
513                     else if(ch == L'\\')
514                     {
515                         currState = 24;
516                         advance();
517                     }
518                     else if(ch == L'"')
519                     {
520                         currState = 25;
521                         advance();
522                     }
523                     else
524                     {
525                         error();
526                     }
527                 }
528                 break;
529             case 24:
530                 {
531                     if(ch != L' ' && ch != L'\r' && ch != L'\n' && ch != L'\t' && ch != 0)
532                     {
533                         currState = 23;
534                         advance();
535                     }
536                     else
537                     {
538                         error();
539                     }
540                 }
541                 break;
542             case 25:
543                 {
544                     accept(CNScriptLexTokenType::StringValue);
545                 }
546                 break;
547             case 26:
548                 {
549                     if(ch == L'=')
550                     {
551                         currState = 27;
552                         advance();
553                     }
554                     else
555                     {
556                         accept(CNScriptLexTokenType::Operator);
557                     }
558                 }
559                 break;
560             case 27:
561                 {
562                     accept(CNScriptLexTokenType::Operator);
563                 }
564                 break;
565             case 28:
566                 {
567                     if(ch == L'+')
568                     {
569                         currState = 29;
570                         advance();
571                     }
572                     else
573                     {
574                         accept(CNScriptLexTokenType::Operator);
575                     }
576                 }
577                 break;
578             case 29:
579                 {
580                     accept(CNScriptLexTokenType::Operator);
581                 }
582                 break;
583             case 30:
584                 {
585                     if(ch == L'-')
586                     {
587                         currState = 31;
588                         advance();
589                     }
590                     else
591                     {
592                         accept(CNScriptLexTokenType::Operator);
593                     }
594                 }
595                 break;
596             case 31:
597                 {
598                     accept(CNScriptLexTokenType::Operator);
599                 }
600                 break;
601             case 32:
602                 {
603                     if(ch == L'=')
604                     {
605                         currState = 33;
606                         advance();
607                     }
608                     else
609                     {
610                         accept(CNScriptLexTokenType::Operator);
611                     }
612                 }
613                 break;
614             case 33:
615                 {
616                     accept(CNScriptLexTokenType::Operator);
617                 }
618                 break;
619             case 34:
620                 {
621                     if(ch == L'|')
622                     {
623                         currState = 35;
624                         advance();
625                     }
626                     else
627                     {
628                         error();
629                     }
630                 }
631                 break;
632             case 35:
633                 {
634                     accept(CNScriptLexTokenType::Operator);
635                 }
636                 break;
637             case 36:
638                 {
639                     if(ch == L'&')
640                     {
641                         currState = 37;
642                         advance();
643                     }
644                     else
645                     {
646                         error();
647                     }
648                 }
649                 break;
650             case 37:
651                 {
652                     accept(CNScriptLexTokenType::Operator);
653                 }
654                 break;
655             case 38:
656                 {
657                     accept(CNScriptLexTokenType::Operator);
658                 }
659                 break;
660             case 39:
661                 {
662                     if(ch == L'=')
663                     {
664                         currState = 40;
665                         advance();
666                     }
667                     else
668                     {
669                         accept(CNScriptLexTokenType::Operator);
670                     }
671                 }
672                 break;
673             case 40:
674                 {
675                     accept(CNScriptLexTokenType::Operator);
676                 }
677                 break;
678             case 41:
679                 {
680                     if(ch == L'=')
681                     {
682                         currState = 42;
683                         advance();
684                     }
685                     else
686                     {
687                         accept(CNScriptLexTokenType::Operator);
688                     }
689                 }
690                 break;
691             case 42:
692                 {
693                     accept(CNScriptLexTokenType::Operator);
694                 }
695                 break;
696             case 43:
697                 {
698                     if(ch == L'_')
699                     {
700                         advance();
701                     }
702                     else if((ch >= L'a' && ch <= 'z'|| (ch >= L'A' && ch <= L'Z'|| (ch >= L'0' && ch <= L'9'))
703                     {
704                         currState = 44;
705                         advance();
706                     }
707                     else
708                     {
709                         error();
710                     }
711                 }
712                 break;
713             case 44:
714                 {
715                     if((ch >= L'a' && ch <= 'z'|| (ch >= L'A' && ch <= L'Z'|| (ch >= L'0' && ch <= L'9'|| ch == L'_')
716                     {
717                         advance();
718                     }
719                     else
720                     {
721                         String tokenValue = tryGetValue();
722                         CNScriptLexTokenType::ECNScriptLexTokenType thisTokenType;
723 
724                         if(tokenValue == L"if")
725                         {
726                             thisTokenType = CNScriptLexTokenType::Keyword_if;
727                         }
728                         else if(tokenValue == L"else")
729                         {
730                             thisTokenType = CNScriptLexTokenType::Keyword_else;
731                         }
732                         else if(tokenValue == L"for")
733                         {
734                             thisTokenType = CNScriptLexTokenType::Keyword_for;
735                         }
736                         else if(tokenValue == L"do")
737                         {
738                             thisTokenType = CNScriptLexTokenType::Keyword_do;
739                         }
740                         else if(tokenValue == L"while")
741                         {
742                             thisTokenType = CNScriptLexTokenType::Keyword_while;
743                         }
744                         else if(tokenValue == L"return")
745                         {
746                             thisTokenType = CNScriptLexTokenType::Keyword_return;
747                         }
748                         else if(    tokenValue == L"int" ||
749                                     tokenValue == L"double" ||
750                                     tokenValue == L"char" ||
751                                     tokenValue == L"string" ||
752                                     tokenValue == L"bool" ||
753                                     tokenValue == L"void")
754                         {
755                             thisTokenType = CNScriptLexTokenType::TypeName;
756                         }
757                         else
758                         {
759                             thisTokenType = CNScriptLexTokenType::Identifier;
760                         }
761 
762                         accept(thisTokenType);
763                     }
764                 }
765                 break;
766             }
767         }
768 
769         return tokens;
770     }
771 }


//Program.cpp
 1 #include "..\..\System\System.h"
 2 #include "CNScript\CNScript.h"
 3 
 4 using namespace System;
 5 using namespace System::IO;
 6 using namespace System::Collections::Generic;
 7 using namespace CNScript_1;
 8 
 9 int Program(const String& arg)
10 {
11     System::Windows::Forms::Application::RunConsoleApplication();
12 
13     CNScriptLexParser parser;
14 
15     Console::WriteLine(L"---------CNScript Lexical Parser---------");
16 
17     Console::Write(L"\r\nInput the code file name:");
18     String codeFileName = Console::Read();
19 
20     StreamReader reader(codeFileName);
21     String code = reader.ReadToEnd();
22     reader.Close();
23 
24     Console::WriteLine(L"\r\nTesting code:");
25     Console::WriteLine(code);
26 
27     Console::Write(L"\r\nPress [Enter] to start testing\r\n");
28     Console::Read();
29 
30     Int64 startTime = GetTickCount();
31     List<CNScriptLexToken> tokens = parser.Parse(code);
32     Int64 endTime = GetTickCount();
33 
34     ListIterator<CNScriptLexToken> tokenIter(tokens);
35 
36     while(tokenIter.Foreach())
37     {
38         Console::WriteLine(tokenIter.Item.ToString());
39     }
40 
41     Console::WriteLine(L"\r\nTime Cost:" + String::ToString(endTime - startTime) + L" ms");
42 
43     return 0;
44 }

 


 运行结果:

Feedback

# re: 第一行代码 --- CNScript 成长日记(4)  回复  更多评论   

2008-08-09 02:53 by 陈梓瀚(vczh)
囧,还是趁早写个DFA搞定吧。你上次都弄了一个正则表达式,现在倒复古了……

# re: 第一行代码 --- CNScript 成长日记(4)  回复  更多评论   

2008-08-09 02:55 by 陈梓瀚(vczh)
你这个parser是不可重入的。

# re: 第一行代码 --- CNScript 成长日记(4)  回复  更多评论   

2008-08-09 11:49 by 陈坤
俺这个是CaiNiaoScript,注重的就是一个“菜”字嘛,主要是培养感情和熟悉流程的。真要写个flex的话也只是时间问题嘛,嘿嘿。

你说不可重入指的是什么?

# re: 第一行代码 --- CNScript 成长日记(4)  回复  更多评论   

2008-08-09 18:34 by 沈臻豪(foxtail)
你在北京哈 真没想到

# re: 第一行代码 --- CNScript 成长日记(4)  回复  更多评论   

2008-08-09 18:49 by 陈梓瀚(vczh)
培养感情,囧……

# re: 第一行代码 --- CNScript 成长日记(4)  回复  更多评论   

2008-08-09 19:34 by 陈坤
@沈臻豪(foxtail)

我当然在北京呀,上次告诉过你的嘛。公司不给上QQ和Hi,只能上MSN,囧。

# re: 第一行代码 --- CNScript 成长日记(4)  回复  更多评论   

2008-08-09 22:39 by 沈臻豪(foxtail)
我msn是greenfoxtail@live.cn 加一个@陈坤

只有注册用户登录后才能发表评论。
网站导航: 博客园   IT新闻   BlogJava   知识库   博问   管理