先贴在
http://cn.minidx.com/index.php?option=com_content&task=view&id=91&Itemid=9 了
声明:可任意转载,复制,修改,以及用于任何您所希望的目的而与作者无关。
这是 Minidx 的 IFilter COM 组件中的一个字符串处理函数:过滤掉一些特殊字符,并将其转换为标准的 ASCII 字符。因为要处理欧洲语言、阿拉伯语等其他语言中的特殊字符,所以 case 分支比较多,请结合自己的需要有选择地修改使用 :)
// Validates a single UTF-16/UCS-2 code unit in place, replacing anything this
// filter does not accept with a blank (L' ').
//
// For reference, the general rule for valid characters (the XML 1.0 "Char"
// production) is
//   #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
// This function is deliberately stricter than that rule. It accepts only:
//   - TAB (0x0009), LF (0x000A), CR (0x000D)
//   - 0x0020 .. 0xD7FF
//   - 0xF900 .. 0xFFFD
// Surrogates (0xD800-0xDFFF), the private use area (0xE000-0xF8FF),
// 0xFFFE/0xFFFF and anything above are not supported and become blanks.
//
// ch: the character to check; overwritten with L' ' when rejected and left
//     untouched otherwise.
inline static void ValidUnicode(wchar_t & ch)
{
    if (ch < 0x0020)    // control range below ASCII space
    {
        // Only the three whitespace controls survive.
        if (ch == 0x0009 || ch == 0x000A || ch == 0x000D)
            return;

        ch = L' ';      // morph every other control character to blank
        return;
    }

    // Printable ASCII and everything up to the start of the surrogate block.
    if (ch <= 0xD7FF)
        return;

    // Strictly above the private use area (which ends at 0xF8FF), up to the
    // last valid BMP character.
    if (ch > 0xF8FF && ch <= 0xFFFD)
        return;

    // Surrogates, private use, non-characters and high-Unicode values.
    ch = L' ';
}
29
30 static void CleanUpCharacters(size_t chBuf, wchar_t *buf)
31 {
32 // simplified form to make parsing easier.
33
34 buf[chBuf] = 0; // must be null terminated..
35
36 for (size_t i = 0; i < chBuf; ++i)
37 {
38 wchar_t & ch = buf[i];
39
40 switch (ch)
41 {
42 case 0: // embedded null
43 case 0x2000: // en quad
44 case 0x2001: // em quad
45 case 0x2002: // en space
46 case 0x2003: // em space
47 case 0x2004: // three-per-em space
48 case 0x2005: // four-per-em space
49 case 0x2006: // six-per-em space
50 case 0x2007: // figure space
51 case 0x2008: // puctuation space
52 case 0x2009: // thin space
53 case 0x200A: // hair space
54 case 0x200B: // zero-width space
55 case 0x200C: // zero-width non-joiner
56 case 0x200D: // zero-width joiner
57 case 0x202f: // no-break space
58 case 0x3000: // ideographic space
59 ch = L' ';
60 break;
61
62 case 0x00B6: // pilcro
63 case 0x2028: // line seperator
64 case 0x2029: // paragraph seperator
65 ch = L'\n';
66 break;
67
68 case 0x00AD: // soft-hyphen
69 case 0x00B7: // middle dot
70 case 0x2010: // hyphen
71 case 0x2011: // non-breaking hyphen
72 case 0x2012: // figure dash
73 case 0x2013: // en dash
74 case 0x2014: // em dash
75 case 0x2015: // quote dash
76 case 0x2027: // hyphenation point
77 case 0x2043: // hyphen bullet
78 case 0x208B: // subscript minus
79 case 0xFE31: // vertical em dash
80 case 0xFE32: // vertical en dash
81 case 0xFE58: // small em dash
82 case 0xFE63: // small hyphen minus
83 case 0xFF0D: // 2007/05/08 TEI zenkaku -
84 ch = L'-';
85 break;
86
87 case 0x00B0: // degree
88 case 0x2018: // left single quote
89 case 0x2019: // right single quote
90 case 0x201A: // low right single quote
91 case 0x201B: // high left single quote
92 case 0x2032: // prime
93 case 0x2035: // reversed prime
94 case 0x2039: // left-pointing angle quotation mark
95 case 0x203A: // right-pointing angle quotation mark
96 ch = L'\'';
97 break;
98
99 case 0x201C: // left double quote
100 case 0x201D: // right double quote
101 case 0x201E: // low right double quote
102 case 0x201F: // high left double quote
103 case 0x2033: // double prime
104 case 0x2034: // triple prime
105 case 0x2036: // reversed double prime
106 case 0x2037: // reversed triple prime
107 case 0x00AB: // left-pointing double angle quotation mark
108 case 0x00BB: // right-pointing double angle quotation mark
109 case 0x3003: // ditto mark
110 case 0x301D: // reversed double prime quotation mark
111 case 0x301E: // double prime quotation mark
112 case 0x301F: // low double prime quotation mark
113 ch = L'\"';
114 break;
115
116 case 0x00A7: // section-sign
117 case 0x2020: // dagger
118 case 0x2021: // double-dagger
119 case 0x2022: // bullet
120 case 0x2023: // triangle bullet
121 case 0x203B: // reference mark
122 case 0xFE55: // small colon
123 case 0xFF1A: // chinese : 65306
124 ch = L':';
125 break;
126
127 case 0x2024: // one dot leader
128 case 0x2025: // two dot leader
129 case 0x2026: // elipsis
130 case 0x3002: // ideographic full stop
131 case 0xFE30: // two dot vertical leader
132 case 0xFE52: // small full stop
133 case 0x30FB: // 2007/05/08 ADD-- zenkaku .
134 case 0xFF0E: // 2007/05/10 ADD-- JP .
135 ch = L'.';
136 break;
137
138 case 0x3001: // ideographic comma
139 case 0xFE50: // small comma
140 case 0xFE51: // small ideographic comma
141 case 0xFF0C: // chinese 65292
142 ch = L',';
143 break;
144
145 case 0xFE54: // small semicolon
146 case 0xFF1B: // 2007/05/08 ADD
147 ch = L';';
148 break;
149
150 case 0x00A6: // broken-bar
151 case 0x2016: // double vertical line
152 ch = L'|';
153 break;
154
155 case 0x2017: // double low line
156 case 0x203E: // overline
157 case 0x203F: // undertie
158 case 0x2040: // character tie
159 case 0xFE33: // vertical low line
160 case 0xFE49: // dashed overline
161 case 0xFE4A: // centerline overline
162 case 0xFE4D: // dashed low line
163 case 0xFE4E: // centerline low line
164 ch = L'_';
165 break;
166
167 case 0x301C: // wave dash
168 case 0x3030: // wavy dash
169 case 0xFE34: // vertical wavy low line
170 case 0xFE4B: // wavy overline
171 case 0xFE4C: // double wavy overline
172 case 0xFE4F: // wavy low line
173 case 0xFF5E: // 2007/04/09 TEI zenkaku
174 ch = L'~';
175 break;
176
177 case 0x2038: // caret
178 case 0x2041: // caret insertion point
179 ch = L'^';
180 break;
181
182 case 0x2030: // per-mille
183 case 0x2031: // per-ten thousand
184 case 0xFE6A: // small per-cent
185 case 0xFF05: // Asia per-cent
186 ch = L'%';
187 break;
188
189 case 0xFE6B: // small commercial at
190 ch = L'@';
191 break;
192
193 case 0x00A9: // copyright
194 ch = L'c';
195 break;
196
197 case 0x00B5: // micro
198 ch = L'u';
199 break;
200
201 case 0x00AE: // registered
202 ch = L'r';
203 break;
204
205 case 0x207A: // superscript plus
206 case 0x208A: // subscript plus
207 case 0xFE62: // small plus
208 case 0xFF0B: // 2007/05/13 TEI ---- zenkaku +
209 ch = L'+';
210 break;
211
212 case 0x2044: // fraction slash
213 ch = L'/';
214 break;
215
216 case 0x2042: // asterism
217 case 0xFE61: // small asterisk
218 case 0xFF0A: // Asia asterisk
219 ch = L'*';
220 break;
221
222 case 0x208C: // subscript equal
223 case 0xFE66: // small equal
224 ch = L'=';
225 break;
226
227 case 0xFE68: // small reverse solidus
228 ch = L'\\';
229 break;
230
231 case 0xFE5F: // small number sign
232 case 0xFF03: // Asia number sign
233 ch = L'#';
234 break;
235
236 case 0xFE60: // small ampersand
237 case 0xFF06: // Asia ampersand
238 ch = L'&';
239 break;
240
241 case 0xFE69: // small dollar sign
242 case 0xFF04: // Asia dollar sign
243 ch = L'$';
244 break;
245
246 case 0x2045: // left square bracket with quill
247 case 0x3010: // left black lenticular bracket
248 case 0x3016: // left white lenticular bracket
249 case 0x301A: // left white square bracket
250 case 0xFE3B: // vertical left lenticular bracket
251 ch = L'[';
252 break;
253
254 case 0x2046: // right square bracket with quill
255 case 0x3011: // right black lenticular bracket
256 case 0x3017: // right white lenticular bracket
257 case 0x301B: // right white square bracket
258 case 0xFE3C: // vertical right lenticular bracket
259 ch = L']';
260 break;
261
262 case 0x208D: // subscript left parenthesis
263 case 0x3014: // left tortise-shell bracket
264 case 0x3018: // left white tortise-shell bracket
265 case 0xFE35: // vertical left parenthesis
266 case 0xFE39: // vertical left tortise-shell bracket
267 case 0xFE59: // small left parenthesis
268 case 0xFE5D: // small left tortise-shell bracket
269 case 0xFF08: // chinese (
270 ch = L'(';
271 break;
272
273 case 0x208E: // subscript right parenthesis
274 case 0x3015: // right tortise-shell bracket
275 case 0x3019: // right white tortise-shell bracket
276 case 0xFE36: // vertical right parenthesis
277 case 0xFE3A: // vertical right tortise-shell bracket
278 case 0xFE5A: // small right parenthesis
279 case 0xFE5E: // small right tortise-shell bracket
280 case 0xFF09: // chinese )
281 ch = L')';
282 break;
283
284 case 0x3008: // left angle bracket
285 case 0x300A: // left double angle bracket
286 case 0xFF3D: // vertical left double angle bracket
287 case 0xFF3F: // vertical left angle bracket
288 case 0xFF64: // small less-than
289 case 0xFF1C: // 2007/04/09 add zenkaku <
290 ch = L'<';
291 break;
292
293 case 0x3009: // right angle bracket
294 case 0x300B: // right double angle bracket
295 case 0xFF3E: // vertical right double angle bracket
296 case 0xFF40: // vertical right angle bracket
297 case 0xFF65: // small greater-than
298 case 0xFF1E: // 2007/04/09 add zenkaku >
299 ch = L'>';
300 break;
301
302 case 0xFE37: // vertical left curly bracket
303 case 0xFE5B: // small left curly bracket
304 ch = L'{';
305 break;
306
307 case 0xFE38: // vertical right curly bracket
308 case 0xFE5C: // small right curly bracket
309 ch = L'}';
310 break;
311
312 case 0x00A1: // inve