d2/db5/utf8_8cpp_source.html

 #include "pfc.h"


 namespace pfc {

 //utf8 stuff


 // Lead-byte classification tables, used in pairs: a lead byte b falls into
 // slot i when (b & mask_tab[i]) == val_tab[i]. The functions below treat a
 // match in slot i as a sequence of i + 1 bytes.
 static const t_uint8 mask_tab[6]={0x80,0xE0,0xF0,0xF8,0xFC,0xFE};


 // Expected value of the masked lead byte for each slot of mask_tab.
 static const t_uint8 val_tab[6]={0,0xC0,0xE0,0xF0,0xF8,0xFC};


 // Returns the sequence length (1..6) announced by the lead byte p_c, or 0 when
 // the byte matches none of the lead-byte patterns in mask_tab / val_tab.
 t_size utf8_char_len_from_header(char p_c) throw()
 {
     for (t_size slot = 0; slot < 6; ++slot)
     {
         // Slot i describes sequences that are i + 1 bytes long.
         if ((p_c & mask_tab[slot]) == val_tab[slot])
             return slot + 1;
     }
     return 0;
 }

 // Decodes one UTF-8 sequence from the start of p_utf8.
 //
 // Sequences of 1..6 bytes are accepted. Reading stops at the first byte that
 // is not a continuation byte, so a null terminator bounds the access.
 //
 // p_utf8 : source bytes.
 // wide   : receives the decoded value; set to 0 whenever 0 is returned.
 // Returns the number of bytes consumed, or 0 for a null byte, an invalid lead
 // byte, a missing continuation byte, or an overlong encoding.
 t_size utf8_decode_char(const char *p_utf8,unsigned & wide) throw() {
     const t_uint8 * utf8 = (const t_uint8*)p_utf8;

     // Single-byte fast path; a null byte reports zero length.
     if (utf8[0]<0x80) {
         wide = utf8[0];
         return utf8[0]>0 ? 1 : 0;
     }
     wide = 0;

     // Classify the lead byte. Slot 0 of the tables describes single bytes,
     // which were handled above, so matching starts at slot 1.
     unsigned slot = 1;
     while ((utf8[0]&mask_tab[slot]) != val_tab[slot]) {
         if (++slot>=6) return 0; // matches no lead-byte pattern
     }
     const unsigned len = slot + 1;

     // Payload bits of the lead byte, then six bits per continuation byte.
     unsigned res = (0xFF>>(len+1)) & utf8[0];
     for (unsigned n=1;n<len;n++) {
         if ((utf8[n]&0xC0) != 0x80)
             return 0;
         res = (res<<6) | (utf8[n]&0x3F);
     }

     // Reject overlong forms: a sequence of len bytes must encode a value that
     // does not fit in len - 1 bytes. Indexed by len; entries 0 and 1 unused.
     static const unsigned min_value[7] = {0,0,0x80,0x800,0x10000,0x200000,0x4000000};
     if (res < min_value[len]) return 0;

     wide = res;
     return len;
 }


 // Decodes one UTF-8 sequence from the start of p_utf8, reading at most max
 // bytes.
 //
 // Sequences of 1..6 bytes are accepted; max values above 6 are clamped to 6.
 //
 // p_utf8 : source bytes.
 // wide   : receives the decoded value; set to 0 whenever 0 is returned.
 // max    : number of bytes available at p_utf8.
 // Returns the number of bytes consumed (never more than max), or 0 for an
 // empty range, a null byte, an invalid lead byte, a sequence that does not
 // fit in max bytes, a missing continuation byte, or an overlong encoding.
 t_size utf8_decode_char(const char *p_utf8,unsigned & wide,t_size max) throw()
 {
     const t_uint8 * utf8 = (const t_uint8*)p_utf8;

     if (max==0) {
         wide = 0;
         return 0;
     }

     // Single-byte fast path; a null byte reports zero length.
     if (utf8[0]<0x80) {
         wide = utf8[0];
         return utf8[0]>0 ? 1 : 0;
     }

     if (max>6) max = 6;
     wide = 0;

     // Classify the lead byte. Giving up once the slot index reaches max
     // guarantees that the resulting length never exceeds max.
     unsigned slot = 0;
     for(;;) {
         if ((utf8[0]&mask_tab[slot])==val_tab[slot]) break;
         if (++slot>=max) return 0;
     }
     const unsigned len = slot + 1;

     // Payload bits of the lead byte, then six bits per continuation byte.
     unsigned res = (0xFF>>(len+1)) & utf8[0];
     for (unsigned n=1;n<len;n++) {
         if ((utf8[n]&0xC0) != 0x80)
             return 0;
         res = (res<<6) | (utf8[n]&0x3F);
     }

     // Reject overlong forms: a sequence of len bytes must encode a value that
     // does not fit in len - 1 bytes. Indexed by len; entries 0 and 1 unused.
     static const unsigned min_value[7] = {0,0,0x80,0x800,0x10000,0x200000,0x4000000};
     if (res < min_value[len]) return 0;

     wide = res;
     return len;
 }


 // Encodes the value wide as a UTF-8 sequence of 1..6 bytes.
 //
 // wide   : value to encode; values above 0x7FFFFFFF cannot be encoded.
 // target : output buffer, or null to only query the required length.
 // Returns the sequence length in bytes, or 0 when wide is out of range.
 t_size utf8_encode_char(unsigned wide,char * target) throw()
 {
     // Marker bits of the lead byte, indexed by sequence length. Index 0 is
     // unused and single-byte sequences carry no marker.
     static const unsigned lead_bits[7] = {0,0,0xC0,0xE0,0xF0,0xF8,0xFC};

     t_size count;
     if (wide > 0x7FFFFFFF)
         return 0;
     else if (wide >= 0x4000000)
         count = 6;
     else if (wide >= 0x200000)
         count = 5;
     else if (wide >= 0x10000)
         count = 4;
     else if (wide >= 0x800)
         count = 3;
     else if (wide >= 0x80)
         count = 2;
     else
         count = 1;

     if (target == 0)
         return count;

     // Emit continuation bytes from the last one backwards, six bits each.
     for (t_size idx = count - 1; idx > 0; --idx)
     {
         target[idx] = 0x80 | (wide & 0x3F);
         wide = wide >> 6;
     }

     // Whatever is left belongs in the lead byte next to its marker bits.
     target[0] = wide | lead_bits[count];

     return count;
 }


 // Encodes the code point cur_wchar as UTF-16.
 //
 // cur_wchar : value to encode.
 // out       : output buffer; up to two code units are written.
 // Returns the number of code units written: 1 for values below 0x10000,
 // 2 (a surrogate pair) for 0x10000..0x10FFFF, and 1 ('?' is written instead)
 // for anything larger.
 t_size utf16_encode_char(unsigned cur_wchar,char16_t * out) throw()
 {
     if (cur_wchar < 0x10000) {
         *out = (char16_t) cur_wchar; return 1;
     } else if (cur_wchar <= 0x10FFFF) {
         // A surrogate pair carries 20 bits of (code point - 0x10000), so the
         // largest representable code point is 0x10FFFF, not 0xFFFFF.
         unsigned c = cur_wchar - 0x10000;
         // High surrogate: 0xD800..0xDBFF, holding the upper 10 bits.
         out[0] = (char16_t)(0xD800 | (0x3FF & (c>>10)) );
         // Low surrogate: 0xDC00..0xDFFF, holding the lower 10 bits.
         out[1] = (char16_t)(0xDC00 | (0x3FF & c) ) ;
         return 2;
     } else {
         // Not representable in UTF-16; substitute a placeholder.
         *out = '?'; return 1;
     }
 }


 // Decodes one code point from UTF-16 input.
 //
 // p_source        : source code units.
 // p_out           : receives the decoded value.
 // p_source_length : number of code units available at p_source.
 // Returns the number of code units consumed. With a single available unit
 // that unit is returned as-is and 1 is reported, even when it is zero; with
 // two or more units a zero unit reports 0.
 t_size utf16_decode_char(const char16_t * p_source,unsigned * p_out,t_size p_source_length) throw() {
     if (p_source_length == 0) {
         *p_out = 0;
         return 0;
     }

     const unsigned first = p_source[0];

     if (p_source_length == 1) {
         *p_out = first;
         return 1;
     }

     if (first == 0) {
         *p_out = 0;
         return 0;
     }

     // A high surrogate followed by a low surrogate forms one code point.
     if ((first & 0xFC00) == 0xD800) {
         const unsigned second = p_source[1];
         if ((second & 0xFC00) == 0xDC00) {
             *p_out = 0x10000 + ( ((first & 0x3FF) << 10) | (second & 0x3FF) );
             return 2;
         }
     }

     // Anything else, including an unpaired surrogate, is passed through.
     *p_out = first;
     return 1;
 }

 #ifdef _MSC_VER

     // wchar_t overload of utf16_decode_char; forwards to wide_decode_char
     // after asserting that wchar_t has the same size as char16_t.
     t_size utf16_decode_char(const wchar_t * p_source,unsigned * p_out,t_size p_source_length) throw() {
         PFC_STATIC_ASSERT( sizeof(wchar_t) == sizeof(char16_t) );
         const t_size consumed = wide_decode_char( p_source, p_out, p_source_length );
         return consumed;
     }

     // wchar_t overload of utf16_encode_char; forwards to wide_encode_char
     // after asserting that wchar_t has the same size as char16_t.
     t_size utf16_encode_char(unsigned c,wchar_t * out) throw() {
         PFC_STATIC_ASSERT( sizeof(wchar_t) == sizeof(char16_t) );
         const t_size written = wide_encode_char( c, out );
         return written;
     }

 #endif


     // Decodes one code point from a wchar_t string.
     //
     // When wchar_t has the size of char16_t the input is handled as UTF-16;
     // otherwise (wchar_t has the size of unsigned) each element is taken as
     // one value.
     // Returns the number of wchar_t elements consumed.
     t_size wide_decode_char(const wchar_t * p_source,unsigned * p_out,t_size p_source_length) throw() {
         PFC_STATIC_ASSERT( sizeof( wchar_t ) == sizeof( char16_t ) || sizeof( wchar_t ) == sizeof( unsigned ) );

         if (sizeof( wchar_t ) != sizeof( char16_t ) ) {
             // One element per value: copy it through, or report an empty range.
             if (p_source_length == 0) {
                 * p_out = 0;
                 return 0;
             }
             * p_out = p_source [ 0 ];
             return 1;
         }

         return utf16_decode_char( reinterpret_cast< const char16_t *>(p_source), p_out, p_source_length );
     }

     // Encodes the value c into a wchar_t buffer.
     //
     // When wchar_t has the size of char16_t the value is written as UTF-16;
     // otherwise (wchar_t has the size of unsigned) it is stored in a single
     // element.
     // Returns the number of wchar_t elements written.
     t_size wide_encode_char(unsigned c,wchar_t * out) throw() {
         PFC_STATIC_ASSERT( sizeof( wchar_t ) == sizeof( char16_t ) || sizeof( wchar_t ) == sizeof( unsigned ) );

         if (sizeof( wchar_t ) != sizeof( char16_t ) ) {
             * out = (wchar_t) c;
             return 1;
         }

         return utf16_encode_char( c, reinterpret_cast< char16_t * >(out) );
     }


 // Returns the value decoded from the start of src by utf8_decode_char; the
 // result is 0 when that function decodes nothing.
 unsigned utf8_get_char(const char * src)
 {
     unsigned decoded = 0;
     // Only the decoded value matters here; the consumed length is ignored.
     utf8_decode_char(src, decoded);
     return decoded;
 }


 // Returns the byte length of the UTF-8 sequence at the start of s, reading at
 // most max bytes; this is the return value of utf8_decode_char with the
 // decoded value thrown away.
 t_size utf8_char_len(const char * s,t_size max) throw()
 {
     unsigned discarded;
     const t_size length = utf8_decode_char(s, discarded, max);
     return length;
 }


 // Returns the byte offset reached after stepping over up to count UTF-8
 // sequences in ptr. Stepping stops early at a null byte or at a sequence
 // for which utf8_char_len reports 0.
 t_size skip_utf8_chars(const char * ptr,t_size count) throw()
 {
     t_size offset = 0;
     while (count != 0 && ptr[offset] != 0)
     {
         const t_size step = utf8_char_len(ptr + offset);
         if (step == 0) break;
         offset += step;
         --count;
     }
     return offset;
 }


 bool is_valid_utf8(const char * param,t_size max) {

     t_size walk = 0;

     while(walk < max && param[walk] != 0) {

         t_size d;

         unsigned dummy;

         d = utf8_decode_char(param + walk,dummy,max - walk);

         if (d==0) return false;

         walk += d;

         if (walk > max) {

             PFC_ASSERT(0);//should not be triggerable

             return false;

         }

     }

     return true;

 }


 // Returns true when every byte of the null-terminated string param is below
 // 0x80; an empty string yields true.
 bool is_lower_ascii(const char * param)
 {
     for (; *param != 0; ++param)
     {
         // Compare as unsigned char: whether plain char is signed is
         // implementation-defined, so testing for a negative value would miss
         // high bytes where char is unsigned.
         if (static_cast<unsigned char>(*param) >= 0x80) return false;
     }
     return true;
 }


 // Returns true when ptr points at a null byte.
 static bool check_end_of_string(const char * ptr)
 {
     const char current = *ptr;
     return current == 0;
 }


 // Copies src into out without splitting a UTF-8 sequence and always
 // null-terminates the result when maxbytes is non-zero.
 //
 // src      : null-terminated source string.
 // out      : destination buffer of at least maxbytes bytes.
 // maxbytes : capacity of out, including the terminating null.
 // Returns the number of bytes copied, not counting the terminating null.
 // Copying stops at the first sequence that does not fit in the remaining
 // space or for which utf8_char_len reports 0.
 unsigned strcpy_utf8_truncate(const char * src,char * out,unsigned maxbytes)
 {
     unsigned rv = 0;
     if (maxbytes>0)
     {
         unsigned remaining = maxbytes - 1;//for null
         while(!check_end_of_string(src) && remaining>0)
         {
             t_size delta = utf8_char_len(src);
             if (delta==0 || delta>remaining) break;
             // Charge the sequence against the remaining space; without this
             // the loop would keep copying past the end of out.
             remaining -= (unsigned)delta;
             for (; delta != 0; --delta)
             {
                 out[rv++] = *(src++);
             }
         }
         out[rv]=0;
     }
     return rv;
 }


 // Counts the UTF-8 sequences in the first num bytes of p.
 //
 // p   : source bytes.
 // num : maximum number of bytes to examine.
 // Returns the number of sequences decoded before reaching num bytes, a null
 // byte, or a sequence the decoder rejects (including one that would extend
 // past num).
 t_size strlen_utf8(const char * p,t_size num) throw()
 {
     t_size chars = 0;
     while (num != 0)
     {
         unsigned decoded;
         // Use the bounded decoder so a sequence straddling the limit is
         // rejected instead of being read past num. It never reports more
         // bytes than it was given, so the subtraction below cannot wrap.
         const t_size step = utf8_decode_char(p, decoded, num);
         if (step == 0 || decoded == 0) break;
         ++chars;
         p += step;
         num -= step;
     }
     return chars;
 }


 // Returns the number of bytes occupied by the first count UTF-8 sequences of
 // string. Counting stops early when utf8_decode_char reports 0, which
 // includes reaching a null byte.
 t_size utf8_chars_to_bytes(const char * string,t_size count) throw()
 {
     t_size offset = 0;
     for (; count != 0; --count)
     {
         unsigned ignored;
         const t_size step = utf8_decode_char(string + offset, ignored);
         if (step == 0) break;
         offset += step;
     }
     return offset;
 }


 }

pfc::utf16_encode_char
t_size utf16_encode_char(unsigned c, char16_t *out)
Definition: utf8.cpp:165

pfc::skip_utf8_chars
t_size skip_utf8_chars(const char *ptr, t_size count)
Definition: utf8.cpp:252

t_uint8
uint8_t t_uint8
Definition: int_types.h:9

pfc::strcpy_utf8_truncate
t_size strcpy_utf8_truncate(const char *src, char *out, t_size maxbytes)

pfc::utf8_encode_char
t_size utf8_encode_char(unsigned c, char *out)
Definition: utf8.cpp:113

pfc.h

pfc::wide_decode_char
t_size wide_decode_char(const wchar_t *p_source, unsigned *p_out, t_size p_source_length=~0)
Definition: utf8.cpp:217

pfc
Definition: file_info_impl.h:22

pfc::mask_tab
static const t_uint8 mask_tab[6]
Definition: utf8.cpp:6

pfc::utf8_get_char
unsigned utf8_get_char(const char *src)
Definition: utf8.cpp:238

pfc::utf8_char_len_from_header
t_size utf8_char_len_from_header(char c)
Definition: utf8.cpp:10

pfc::strlen_utf8
t_size strlen_utf8(const char *s, t_size num=~0)
Definition: utf8.cpp:316

t_size
size_t t_size
Definition: int_types.h:48

pfc::val_tab
static const t_uint8 val_tab[6]
Definition: utf8.cpp:8

pfc::utf16_decode_char
t_size utf16_decode_char(const char16_t *p_source, unsigned *p_out, t_size p_source_length=~0)
Definition: utf8.cpp:181

pfc::utf8_decode_char
t_size utf8_decode_char(const char *src, unsigned &out, t_size src_bytes)
Definition: utf8.cpp:64

pfc::is_valid_utf8
bool is_valid_utf8(const char *param, t_size max=~0)
Definition: utf8.cpp:264

pfc::wide_encode_char
t_size wide_encode_char(unsigned c, wchar_t *out)
Definition: utf8.cpp:227

pfc::check_end_of_string
static bool check_end_of_string(const char *ptr)
Definition: utf8.cpp:290

pfc::utf8_chars_to_bytes
t_size utf8_chars_to_bytes(const char *string, t_size count)
Definition: utf8.cpp:332

pfc::utf8_char_len
t_size utf8_char_len(const char *s, t_size max=~0)
Definition: utf8.cpp:246

pfc::is_lower_ascii
bool is_lower_ascii(const char *param)
Definition: utf8.cpp:280