foobar2000 SDK  2015-08-03
utf8.cpp
Go to the documentation of this file.
1 #include "pfc.h"
2 
3 namespace pfc {
4 //utf8 stuff
5 
6 static const t_uint8 mask_tab[6]={0x80,0xE0,0xF0,0xF8,0xFC,0xFE};
7 
8 static const t_uint8 val_tab[6]={0,0xC0,0xE0,0xF0,0xF8,0xFC};
9 
11 {
12  t_size cnt = 0;
13  for(;;)
14  {
15  if ((p_c & mask_tab[cnt])==val_tab[cnt]) break;
16  if (++cnt>=6) return 0;
17  }
18 
19  return cnt + 1;
20 
21 }
22 t_size utf8_decode_char(const char *p_utf8,unsigned & wide) throw() {
23  const t_uint8 * utf8 = (const t_uint8*)p_utf8;
24  const t_size max = 6;
25 
26  if (utf8[0]<0x80) {
27  wide = utf8[0];
28  return utf8[0]>0 ? 1 : 0;
29  }
30  wide = 0;
31 
32  unsigned res=0;
33  unsigned n;
34  unsigned cnt=0;
35  for(;;)
36  {
37  if ((*utf8&mask_tab[cnt])==val_tab[cnt]) break;
38  if (++cnt>=max) return 0;
39  }
40  cnt++;
41 
42  if (cnt==2 && !(*utf8&0x1E)) return 0;
43 
44  if (cnt==1)
45  res=*utf8;
46  else
47  res=(0xFF>>(cnt+1))&*utf8;
48 
49  for (n=1;n<cnt;n++)
50  {
51  if ((utf8[n]&0xC0) != 0x80)
52  return 0;
53  if (!res && n==2 && !((utf8[n]&0x7F) >> (7 - cnt)))
54  return 0;
55 
56  res=(res<<6)|(utf8[n]&0x3F);
57  }
58 
59  wide = res;
60 
61  return cnt;
62 }
63 
64 t_size utf8_decode_char(const char *p_utf8,unsigned & wide,t_size max) throw()
65 {
66  const t_uint8 * utf8 = (const t_uint8*)p_utf8;
67 
68  if (max==0) {
69  wide = 0;
70  return 0;
71  }
72 
73  if (utf8[0]<0x80) {
74  wide = utf8[0];
75  return utf8[0]>0 ? 1 : 0;
76  }
77  if (max>6) max = 6;
78  wide = 0;
79 
80  unsigned res=0;
81  unsigned n;
82  unsigned cnt=0;
83  for(;;)
84  {
85  if ((*utf8&mask_tab[cnt])==val_tab[cnt]) break;
86  if (++cnt>=max) return 0;
87  }
88  cnt++;
89 
90  if (cnt==2 && !(*utf8&0x1E)) return 0;
91 
92  if (cnt==1)
93  res=*utf8;
94  else
95  res=(0xFF>>(cnt+1))&*utf8;
96 
97  for (n=1;n<cnt;n++)
98  {
99  if ((utf8[n]&0xC0) != 0x80)
100  return 0;
101  if (!res && n==2 && !((utf8[n]&0x7F) >> (7 - cnt)))
102  return 0;
103 
104  res=(res<<6)|(utf8[n]&0x3F);
105  }
106 
107  wide = res;
108 
109  return cnt;
110 }
111 
112 
113 t_size utf8_encode_char(unsigned wide,char * target) throw()
114 {
115  t_size count;
116 
117  if (wide < 0x80)
118  count = 1;
119  else if (wide < 0x800)
120  count = 2;
121  else if (wide < 0x10000)
122  count = 3;
123  else if (wide < 0x200000)
124  count = 4;
125  else if (wide < 0x4000000)
126  count = 5;
127  else if (wide <= 0x7FFFFFFF)
128  count = 6;
129  else
130  return 0;
131  //if (count>max) return 0;
132 
133  if (target == 0)
134  return count;
135 
136  switch (count)
137  {
138  case 6:
139  target[5] = 0x80 | (wide & 0x3F);
140  wide = wide >> 6;
141  wide |= 0x4000000;
142  case 5:
143  target[4] = 0x80 | (wide & 0x3F);
144  wide = wide >> 6;
145  wide |= 0x200000;
146  case 4:
147  target[3] = 0x80 | (wide & 0x3F);
148  wide = wide >> 6;
149  wide |= 0x10000;
150  case 3:
151  target[2] = 0x80 | (wide & 0x3F);
152  wide = wide >> 6;
153  wide |= 0x800;
154  case 2:
155  target[1] = 0x80 | (wide & 0x3F);
156  wide = wide >> 6;
157  wide |= 0xC0;
158  case 1:
159  target[0] = wide;
160  }
161 
162  return count;
163 }
164 
165 t_size utf16_encode_char(unsigned cur_wchar,char16_t * out) throw()
166 {
167  if (cur_wchar < 0x10000) {
168  *out = (char16_t) cur_wchar; return 1;
169  } else if (cur_wchar < (1 << 20)) {
170  unsigned c = cur_wchar - 0x10000;
171  //MSDN:
172  //The first (high) surrogate is a 16-bit code value in the range U+D800 to U+DBFF. The second (low) surrogate is a 16-bit code value in the range U+DC00 to U+DFFF. Using surrogates, Unicode can support over one million characters. For more details about surrogates, refer to The Unicode Standard, version 2.0.
173  out[0] = (char16_t)(0xD800 | (0x3FF & (c>>10)) );
174  out[1] = (char16_t)(0xDC00 | (0x3FF & c) ) ;
175  return 2;
176  } else {
177  *out = '?'; return 1;
178  }
179 }
180 
181 t_size utf16_decode_char(const char16_t * p_source,unsigned * p_out,t_size p_source_length) throw() {
182  if (p_source_length == 0) {*p_out = 0; return 0; }
183  else if (p_source_length == 1) {
184  *p_out = p_source[0];
185  return 1;
186  } else {
187  t_size retval = 0;
188  unsigned decoded = p_source[0];
189  if (decoded != 0)
190  {
191  retval = 1;
192  if ((decoded & 0xFC00) == 0xD800)
193  {
194  unsigned low = p_source[1];
195  if ((low & 0xFC00) == 0xDC00)
196  {
197  decoded = 0x10000 + ( ((decoded & 0x3FF) << 10) | (low & 0x3FF) );
198  retval = 2;
199  }
200  }
201  }
202  *p_out = decoded;
203  return retval;
204  }
205 }
206 #ifdef _MSC_VER
207  t_size utf16_decode_char(const wchar_t * p_source,unsigned * p_out,t_size p_source_length) throw() {
208  PFC_STATIC_ASSERT( sizeof(wchar_t) == sizeof(char16_t) );
209  return wide_decode_char( p_source, p_out, p_source_length );
210  }
211  t_size utf16_encode_char(unsigned c,wchar_t * out) throw() {
212  PFC_STATIC_ASSERT( sizeof(wchar_t) == sizeof(char16_t) );
213  return wide_encode_char( c, out );
214  }
215 #endif
216 
217  t_size wide_decode_char(const wchar_t * p_source,unsigned * p_out,t_size p_source_length) throw() {
218  PFC_STATIC_ASSERT( sizeof( wchar_t ) == sizeof( char16_t ) || sizeof( wchar_t ) == sizeof( unsigned ) );
219  if (sizeof( wchar_t ) == sizeof( char16_t ) ) {
220  return utf16_decode_char( reinterpret_cast< const char16_t *>(p_source), p_out, p_source_length );
221  } else {
222  if (p_source_length == 0) { * p_out = 0; return 0; }
223  * p_out = p_source [ 0 ];
224  return 1;
225  }
226  }
227  t_size wide_encode_char(unsigned c,wchar_t * out) throw() {
228  PFC_STATIC_ASSERT( sizeof( wchar_t ) == sizeof( char16_t ) || sizeof( wchar_t ) == sizeof( unsigned ) );
229  if (sizeof( wchar_t ) == sizeof( char16_t ) ) {
230  return utf16_encode_char( c, reinterpret_cast< char16_t * >(out) );
231  } else {
232  * out = (wchar_t) c;
233  return 1;
234  }
235  }
236 
237 
238 unsigned utf8_get_char(const char * src)
239 {
240  unsigned rv = 0;
241  utf8_decode_char(src,rv);
242  return rv;
243 }
244 
245 
246 t_size utf8_char_len(const char * s,t_size max) throw()
247 {
248  unsigned dummy;
249  return utf8_decode_char(s,dummy,max);
250 }
251 
252 t_size skip_utf8_chars(const char * ptr,t_size count) throw()
253 {
254  t_size num = 0;
255  for(;count && ptr[num];count--)
256  {
257  t_size d = utf8_char_len(ptr+num);
258  if (d<=0) break;
259  num+=d;
260  }
261  return num;
262 }
263 
264 bool is_valid_utf8(const char * param,t_size max) {
265  t_size walk = 0;
266  while(walk < max && param[walk] != 0) {
267  t_size d;
268  unsigned dummy;
269  d = utf8_decode_char(param + walk,dummy,max - walk);
270  if (d==0) return false;
271  walk += d;
272  if (walk > max) {
273  PFC_ASSERT(0);//should not be triggerable
274  return false;
275  }
276  }
277  return true;
278 }
279 
// Returns true when every byte of the NUL-terminated string param is in the
// 7-bit range 0x01..0x7F (an empty string also yields true), false as soon as
// a byte with the high bit set is found.
bool is_lower_ascii(const char * param)
{
	for (; *param != 0; ++param)
	{
		// Compare as unsigned char: whether plain char is signed is
		// implementation-defined, so a "< 0" test never fires on platforms
		// where char is unsigned.
		if ((unsigned char)*param >= 0x80) return false;
	}
	return true;
}
289 
// True when ptr points at the terminating NUL byte of a string.
static bool check_end_of_string(const char * ptr)
{
	return *ptr == '\0';
}
294 
295 unsigned strcpy_utf8_truncate(const char * src,char * out,unsigned maxbytes)
296 {
297  unsigned rv = 0 , ptr = 0;
298  if (maxbytes>0)
299  {
300  maxbytes--;//for null
301  while(!check_end_of_string(src) && maxbytes>0)
302  {
303  t_size delta = utf8_char_len(src);
304  if (delta>maxbytes || delta==0) break;
305  do
306  {
307  out[ptr++] = *(src++);
308  } while(--delta);
309  rv = ptr;
310  }
311  out[rv]=0;
312  }
313  return rv;
314 }
315 
316 t_size strlen_utf8(const char * p,t_size num) throw()
317 {
318  unsigned w;
319  t_size d;
320  t_size ret = 0;
321  for(;num;)
322  {
323  d = utf8_decode_char(p,w);
324  if (w==0 || d<=0) break;
325  ret++;
326  p+=d;
327  num-=d;
328  }
329  return ret;
330 }
331 
332 t_size utf8_chars_to_bytes(const char * string,t_size count) throw()
333 {
334  t_size bytes = 0;
335  while(count)
336  {
337  unsigned dummy;
338  t_size delta = utf8_decode_char(string+bytes,dummy);
339  if (delta==0) break;
340  bytes += delta;
341  count--;
342  }
343  return bytes;
344 }
345 
346 }
t_size utf16_encode_char(unsigned c, char16_t *out)
Definition: utf8.cpp:165
t_size skip_utf8_chars(const char *ptr, t_size count)
Definition: utf8.cpp:252
uint8_t t_uint8
Definition: int_types.h:9
t_size strcpy_utf8_truncate(const char *src, char *out, t_size maxbytes)
t_size utf8_encode_char(unsigned c, char *out)
Definition: utf8.cpp:113
t_size wide_decode_char(const wchar_t *p_source, unsigned *p_out, t_size p_source_length=~0)
Definition: utf8.cpp:217
static const t_uint8 mask_tab[6]
Definition: utf8.cpp:6
unsigned utf8_get_char(const char *src)
Definition: utf8.cpp:238
t_size utf8_char_len_from_header(char c)
Definition: utf8.cpp:10
t_size strlen_utf8(const char *s, t_size num=~0)
Definition: utf8.cpp:316
size_t t_size
Definition: int_types.h:48
static const t_uint8 val_tab[6]
Definition: utf8.cpp:8
t_size utf16_decode_char(const char16_t *p_source, unsigned *p_out, t_size p_source_length=~0)
Definition: utf8.cpp:181
t_size utf8_decode_char(const char *src, unsigned &out, t_size src_bytes)
Definition: utf8.cpp:64
bool is_valid_utf8(const char *param, t_size max=~0)
Definition: utf8.cpp:264
t_size wide_encode_char(unsigned c, wchar_t *out)
Definition: utf8.cpp:227
static bool check_end_of_string(const char *ptr)
Definition: utf8.cpp:290
t_size utf8_chars_to_bytes(const char *string, t_size count)
Definition: utf8.cpp:332
t_size utf8_char_len(const char *s, t_size max=~0)
Definition: utf8.cpp:246
bool is_lower_ascii(const char *param)
Definition: utf8.cpp:280