// Convert a training corpus into the format accepted by CRF
// author benbendy
// date 2009 1 2
 4 
#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>

using namespace std;
10 
/**
 * Tests whether a pair of bytes passes this file's "Chinese character" check.
 *
 * NOTE: the parameter names are misleading. The range tests below treat `lo`
 * as the FIRST byte of the pair and `hi` as the SECOND byte, and the callers
 * in this file invoke it as chinese(str[i + 1], str[i]).
 * The accepted ranges look like the GBK double-byte layout with the symbol
 * rows excluded -- presumably the input is GBK text; confirm against the data.
 *
 * @param hi second byte of the pair; accepted when >= 0x40 and neither 0x7F
 *           nor 0xFF.
 * @param lo first byte of the pair; accepted when inside 0x81..0xFE but
 *           outside 0xA1..0xA9.
 * @return true only when both bytes are accepted.
 */
bool chinese(unsigned char hi, unsigned char lo)
{
    const bool first_in_range   = lo >= 0x81 && lo <= 0xFE;
    const bool first_is_symbol  = lo >= 0xA1 && lo <= 0xA9;
    const bool second_is_valid  = hi >= 0x40 && hi != 0x7F && hi != 0xFF;

    return first_in_range && !first_is_symbol && second_is_valid;
}
22 
23 int main()
24 {
25      string  str;
26      ifstream cin("in.txt");
27      ofstream cout("out.txt");
28      ofstream fout("training.txt");
29      while(getline(cin,str))
30      {
31       bool flag=true;
32         for(int i=0;i<str.size()-1;i++)
33         {
34                  
35            if(chinese(str[i+1],str[i]))
36               i++;
37            else if(str[i]!=' ')
38                   {
39                     flag=false;
40                     break;
41                   }
42         } 
43        if(!flag)
44           continue;
45         cout<<str<<endl;
46 
47        char s[3];
48        s[2]='\0';
49          bool bef=false;
50         for(int i=0;i<str.size()-1;i++)
51            {
52             if(chinese(str[i+1],str[i]))
53              {
54                   if(!bef )
55                    {
56                      s[0]=str[i];
57                      s[1]=str[i+1];
58                      fout<<s<<" B"<<endl;
59                      bef=true;
60                     }
61                   else{
62                      s[0]=str[i];
63                      s[1]=str[i+1];
64                      fout<<s<<" I"<<endl;
65                      bef=true;
66 
67                    }
68 i++;
69               }
70               else {
71                  bef=false;
72                     }
73          }
74        fout<<endl;
75 }
76 
77 }


// The following (separate) program converts text directly into the CRF test format.

 
/**
 * Converts text into a one-character-per-line CRF test file.
 *
 * Reads "in.txt" line by line and writes "out.txt":
 *   - a byte in 0x81..0xFE starts a two-byte character, which is written on
 *     its own line;
 *   - a two-byte symbol, or an ASCII ',' or '.', produces an empty line
 *     instead (a sequence break);
 *   - all other single bytes are dropped;
 *   - an empty line is written after every input line.
 *
 * NOTE(review): the original compared each two-byte character against a list
 * of punctuation literals, but those literals were lost (they had become ""
 * and a 6-byte string, so the comparison could never match). They are
 * replaced here by a range test on the first byte, 0xA1..0xA9 -- presumably
 * the symbol rows of the input encoding. Confirm that this matches the
 * intended punctuation set.
 *
 * NOTE(review): as in the original loop, scanning stops one byte before the
 * end of the line, so a final unpaired byte is never inspected.
 *
 * @return 0 on success, 1 when a file cannot be opened.
 */
int main()
{
    // Unsigned bounds avoid the implementation-defined narrowing of
    // "char low = 0x81" while selecting the same byte values.
    const unsigned char lead_min   = 0x81;
    const unsigned char lead_max   = 0xFE;
    const unsigned char symbol_min = 0xA1;
    const unsigned char symbol_max = 0xA9;

    std::ifstream text("in.txt");
    if (!text) {
        std::cerr << "cannot open in.txt" << '\n';
        return 1;
    }

    std::ofstream fout("out.txt");
    if (!fout) {
        std::cerr << "cannot open out.txt" << '\n';
        return 1;
    }

    std::string line;
    while (std::getline(text, line)) {
        // "i + 1 < size()" instead of "i < size() - 1": the latter wraps
        // around for an empty line because size() is unsigned.
        for (std::size_t i = 0; i + 1 < line.size(); ++i) {
            const unsigned char lead = static_cast<unsigned char>(line[i]);

            if (lead >= lead_min && lead <= lead_max) {
                if (lead >= symbol_min && lead <= symbol_max) {
                    fout << '\n';  // punctuation or other symbol: break
                } else {
                    fout << line.substr(i, 2) << '\n';
                }
                ++i;  // skip the second byte of the pair
            } else if (line[i] == ',' || line[i] == '.') {
                fout << '\n';  // ASCII punctuation: break
            }
        }
        fout << '\n';  // blank line after every input line
    }

    return 0;
}