Skip to content

Commit

Permalink
New UTF8 charset
Browse files Browse the repository at this point in the history
  • Loading branch information
bar@gw.udmsearch.izhnet.ru committed Mar 28, 2002
1 parent 1b54e7c commit 55e0a9c
Show file tree
Hide file tree
Showing 14 changed files with 2,088 additions and 54 deletions.
2 changes: 1 addition & 1 deletion configure.in
Expand Up @@ -1872,7 +1872,7 @@ CHARSETS_AVAILABLE="armscii8 big5 cp1251 cp1257
latin1 latin1_de latin2 latin5 sjis swe7 tis620 ujis
usa7 utf8 win1250 win1251ukr"
CHARSETS_DEPRECATED="win1251"
CHARSETS_COMPLEX="big5 czech euc_kr gb2312 gbk latin1_de sjis tis620 ujis"
CHARSETS_COMPLEX="big5 czech euc_kr gb2312 gbk latin1_de sjis tis620 ujis utf8"
DEFAULT_CHARSET=latin1
AC_DIVERT_POP

Expand Down
91 changes: 67 additions & 24 deletions include/m_ctype.h
Expand Up @@ -29,6 +29,22 @@ extern "C" {

#define CHARSET_DIR "charsets/"

/*
  Wide-character value type, defined as a macro alias for ulong.
  Used in this header by the my_utf8_uni() / my_uni_utf8() prototypes.
*/
#define my_wc_t ulong

/*
  Record of three 16-bit values named toupper, tolower and sort.
  NOTE(review): presumably the upper-case form, lower-case form and sort
  weight associated with one character -- inferred from the field names
  only; confirm against the code that fills these tables.
*/
typedef struct unicase_info_st {
uint16 toupper;
uint16 tolower;
uint16 sort;
} MY_UNICASE_INFO;

/*
  Status codes.
  NOTE(review): presumably the return values of the character conversion
  routines (illegal sequence / illegal unicode / buffer too small / too few
  bytes) -- inferred from the names; confirm with the callers.

  MY_CS_ILSEQ and MY_CS_ILUNI share the value 0.
  MY_CS_TOOSMALL is -1.
  MY_CS_TOOFEW(n) evaluates to -1-n (-2 for n == 1, -3 for n == 2, ...),
  so for n >= 1 it never collides with MY_CS_TOOSMALL.
*/
#define MY_CS_ILSEQ 0
#define MY_CS_ILUNI 0
#define MY_CS_TOOSMALL -1
#define MY_CS_TOOFEW(n) (-1-(n))




typedef struct charset_info_st
{
uint number;
Expand All @@ -48,9 +64,9 @@ typedef struct charset_info_st
char *, char *, uint *, uint *);

uint mbmaxlen;
int (*ismbchar)(const char *, const char *);
my_bool (*ismbhead)(uint);
int (*mbcharlen)(uint);
int (*ismbchar)(struct charset_info_st *, const char *, const char *);
my_bool (*ismbhead)(struct charset_info_st *, uint);
int (*mbcharlen)(struct charset_info_st *, uint);

/* Functions for case conversion */
void (*caseup_str)(struct charset_info_st *, char *);
Expand Down Expand Up @@ -107,9 +123,9 @@ extern int my_strnncoll_big5(CHARSET_INFO *,const uchar *, uint, const uchar
extern int my_strnxfrm_big5(CHARSET_INFO *,uchar *, uint, const uchar *, uint);
extern my_bool my_like_range_big5(CHARSET_INFO *,const char *, uint, pchar, uint,
char *, char *, uint *, uint *);
extern int ismbchar_big5(const char *, const char *);
extern my_bool ismbhead_big5(uint);
extern int mbcharlen_big5(uint);
extern int ismbchar_big5(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_big5(CHARSET_INFO *, uint);
extern int mbcharlen_big5(CHARSET_INFO *, uint);
#endif

#ifdef HAVE_CHARSET_czech
Expand All @@ -125,17 +141,17 @@ extern my_bool my_like_range_czech(CHARSET_INFO *,
#ifdef HAVE_CHARSET_euc_kr
/* declarations for the euc_kr character set */
extern uchar ctype_euc_kr[], to_lower_euc_kr[], to_upper_euc_kr[], sort_order_euc_kr[];
extern int ismbchar_euc_kr(const char *, const char *);
extern my_bool ismbhead_euc_kr(uint);
extern int mbcharlen_euc_kr(uint);
extern int ismbchar_euc_kr(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_euc_kr(CHARSET_INFO *, uint);
extern int mbcharlen_euc_kr(CHARSET_INFO *, uint);
#endif

#ifdef HAVE_CHARSET_gb2312
/* declarations for the gb2312 character set */
extern uchar ctype_gb2312[], to_lower_gb2312[], to_upper_gb2312[], sort_order_gb2312[];
extern int ismbchar_gb2312(const char *, const char *);
extern my_bool ismbhead_gb2312(uint);
extern int mbcharlen_gb2312(uint);
extern int ismbchar_gb2312(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_gb2312(CHARSET_INFO *, uint);
extern int mbcharlen_gb2312(CHARSET_INFO *, uint);
#endif

#ifdef HAVE_CHARSET_gbk
Expand All @@ -145,9 +161,9 @@ extern int my_strnncoll_gbk(CHARSET_INFO *, const uchar *, uint, const uchar
extern int my_strnxfrm_gbk(CHARSET_INFO *, uchar *, uint, const uchar *, uint);
extern my_bool my_like_range_gbk(CHARSET_INFO *, const char *, uint, pchar, uint,
char *, char *, uint *, uint *);
extern int ismbchar_gbk(const char *, const char *);
extern my_bool ismbhead_gbk(uint);
extern int mbcharlen_gbk(uint);
extern int ismbchar_gbk(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_gbk(CHARSET_INFO *, uint);
extern int mbcharlen_gbk(CHARSET_INFO *, uint);
#endif

#ifdef HAVE_CHARSET_latin1_de
Expand All @@ -166,9 +182,9 @@ extern int my_strnncoll_sjis(CHARSET_INFO *, const uchar *, uint, const ucha
extern int my_strnxfrm_sjis(CHARSET_INFO *, uchar *, uint, const uchar *, uint);
extern my_bool my_like_range_sjis(CHARSET_INFO *, const char *, uint, pchar, uint,
char *, char *, uint *, uint *);
extern int ismbchar_sjis(const char *, const char *);
extern my_bool ismbhead_sjis(uint);
extern int mbcharlen_sjis(uint);
extern int ismbchar_sjis(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_sjis(CHARSET_INFO *, uint);
extern int mbcharlen_sjis(CHARSET_INFO *, uint);
#endif

#ifdef HAVE_CHARSET_tis620
Expand All @@ -183,11 +199,38 @@ extern my_bool my_like_range_tis620(CHARSET_INFO *, const char *, uint, pchar, u
#ifdef HAVE_CHARSET_ujis
/* declarations for the ujis character set */
extern uchar ctype_ujis[], to_lower_ujis[], to_upper_ujis[], sort_order_ujis[];
extern int ismbchar_ujis(const char *, const char *);
extern my_bool ismbhead_ujis(uint);
extern int mbcharlen_ujis(uint);
extern int ismbchar_ujis(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_ujis(CHARSET_INFO *, uint);
extern int mbcharlen_ujis(CHARSET_INFO *, uint);
#endif

#ifdef HAVE_CHARSET_utf8
/*
  declarations for the utf8 character set

  NOTE(review): unlike the other charsets declared in this header, no
  sort_order_utf8[] table is declared here and the functions carry a my_
  prefix (my_ismbchar_utf8 vs. ismbchar_big5) -- confirm this is intended.
*/

/* Byte tables for the charset. */
extern uchar ctype_utf8[];
extern uchar to_lower_utf8[];
extern uchar to_upper_utf8[];

/* strnncoll / strnxfrm entry points; both take explicit buffer lengths. */
int my_strnncoll_utf8(CHARSET_INFO *cs,
const uchar *s, uint s_len, const uchar *t, uint t_len);

int my_strnxfrm_utf8(CHARSET_INFO *cs,
uchar *dest, uint destlen, const uchar *src, uint srclen);

/* Multi-byte helpers; signatures match the ismbchar/ismbhead/mbcharlen
   members of CHARSET_INFO (charset pointer first). */
int my_ismbchar_utf8(CHARSET_INFO *cs, const char *b, const char *e);
my_bool my_ismbhead_utf8(CHARSET_INFO * cs, uint ch);
int my_mbcharlen_utf8(CHARSET_INFO *cs, uint c);

/* Case changing: the _str variants take only a string pointer, the others
   take a pointer plus a length. */
void my_caseup_str_utf8(CHARSET_INFO * cs, char * s);
void my_casedn_str_utf8(CHARSET_INFO *cs, char * s);
void my_caseup_utf8(CHARSET_INFO *cs, char *s, uint len);
void my_casedn_utf8(CHARSET_INFO *cs, char *s, uint len);

/* strcasecmp / strncasecmp entry points. */
int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t);
int my_strncasecmp_utf8(CHARSET_INFO *cs, const char *s,const char *t,uint l);

/* Conversion between utf8 byte ranges and my_wc_t values.
   NOTE(review): presumably the int result uses the MY_CS_* codes -- confirm. */
int my_utf8_uni (CHARSET_INFO *cs, my_wc_t *p, const uchar *s, const uchar *e);
int my_uni_utf8 (CHARSET_INFO *cs, my_wc_t pwc , uchar *b, uchar *e);
#endif

#define _U 01 /* Upper case */
#define _L 02 /* Lower case */
Expand Down Expand Up @@ -229,9 +272,9 @@ extern int mbcharlen_ujis(uint);
((s)->like_range((s), (a), (b), (c), (d), (e), (f), (g), (h)))

#define use_mb(s) ((s)->ismbchar != NULL)
#define my_ismbchar(s, a, b) ((s)->ismbchar((a), (b)))
#define my_ismbhead(s, a) ((s)->ismbhead((a)))
#define my_mbcharlen(s, a) ((s)->mbcharlen((a)))
/*
  Multi-byte helpers: call through the charset's function pointers and hand
  the charset itself over as the first argument.
  The macro argument s is expanded twice, so it must not have side effects.
*/
#define my_ismbchar(s, a, b) ((s)->ismbchar((s), (a), (b)))
#define my_ismbhead(s, a) ((s)->ismbhead((s), (a)))
#define my_mbcharlen(s, a) ((s)->mbcharlen((s),(a)))

#define my_caseup(s, a, l) ((s)->caseup((s), (a), (l)))
#define my_casedn(s, a, l) ((s)->casedn((s), (a), (l)))
Expand Down
2 changes: 1 addition & 1 deletion libmysql/Makefile.shared
Expand Up @@ -41,7 +41,7 @@ mystringsobjects = strmov.lo strxmov.lo strxnmov.lo strnmov.lo \
ctype.lo ctype-simple.lo ctype-mb.lo \
ctype-big5.lo ctype-czech.lo ctype-euc_kr.lo \
ctype-gb2312.lo ctype-gbk.lo ctype-latin1_de.lo \
ctype-sjis.lo ctype-tis620.lo ctype-ujis.lo
ctype-sjis.lo ctype-tis620.lo ctype-ujis.lo ctype-utf8.lo

mystringsextra= strto.c
dbugobjects = dbug.lo # IT IS IN SAFEMALLOC.C sanity.lo
Expand Down
5 changes: 4 additions & 1 deletion sql/init.cc
Expand Up @@ -57,7 +57,10 @@ void unireg_init(ulong options)

for (cs=compiled_charsets; cs->number; cs++)
{
uchar max_char=cs->sort_order[(uchar) cs->max_sort_char];
uchar max_char;
if (!cs->sort_order)
continue;
cs->sort_order[(uchar) cs->max_sort_char];
for (i = 0; i < 256; i++)
{
if ((uchar) cs->sort_order[i] > max_char)
Expand Down
1 change: 1 addition & 0 deletions sql/share/charsets/Index
Expand Up @@ -37,3 +37,4 @@ cp1257 29
latin5 30
latin1_de 31
armscii8 32
utf8 33
8 changes: 4 additions & 4 deletions strings/Makefile.am
Expand Up @@ -22,27 +22,27 @@ pkglib_LIBRARIES = libmystrings.a
# Exact one of ASSEMBLER_X
if ASSEMBLER_x86
ASRCS = strings-x86.s longlong2str-x86.s
CSRCS = bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c
CSRCS = bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c ctype-utf8.c
else
if ASSEMBLER_sparc
# These files MUST all be on the same line!! Otherwise automake
# generates a very broken makefile
ASRCS = bmove_upp-sparc.s strappend-sparc.s strend-sparc.s strinstr-sparc.s strmake-sparc.s strmov-sparc.s strnmov-sparc.s strstr-sparc.s strxmov-sparc.s
CSRCS = strcont.c strfill.c strcend.c is_prefix.c longlong2str.c bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c
CSRCS = strcont.c strfill.c strcend.c is_prefix.c longlong2str.c bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c ctype-utf8.c
else
#no assembler
ASRCS =
# These files MUST all be on the same line!! Otherwise automake
# generates a very broken makefile
CSRCS = strxmov.c bmove_upp.c strappend.c strcont.c strend.c strfill.c strcend.c is_prefix.c strstr.c strinstr.c strmake.c strnmov.c strmov.c longlong2str.c bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c
CSRCS = strxmov.c bmove_upp.c strappend.c strcont.c strend.c strfill.c strcend.c is_prefix.c strstr.c strinstr.c strmake.c strnmov.c strmov.c longlong2str.c bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c ctype-utf8.c
endif
endif

libmystrings_a_SOURCES = $(ASRCS) $(CSRCS)
noinst_PROGRAMS = conf_to_src
# Default charset definitions
EXTRA_DIST = ctype-big5.c ctype-czech.c ctype-euc_kr.c \
ctype-gb2312.c ctype-gbk.c ctype-sjis.c \
ctype-gb2312.c ctype-gbk.c ctype-sjis.c ctype-utf8.c \
ctype-tis620.c ctype-ujis.c ctype-latin1_de.c \
strto.c strings-x86.s longlong2str-x86.s \
strxmov.c bmove_upp.c strappend.c strcont.c strend.c \
Expand Down
6 changes: 3 additions & 3 deletions strings/ctype-big5.c
Expand Up @@ -378,17 +378,17 @@ my_bool my_like_range_big5(CHARSET_INFO *cs,
return 0;
}

int ismbchar_big5(const char* p, const char *e)
/*
  Return 2 when p points at a big5 head byte that is followed, inside
  [p, e), by a big5 tail byte; return 0 otherwise. cs is not used.
*/
int ismbchar_big5(CHARSET_INFO *cs, const char *p, const char *e)
{
  if (!(isbig5head(*(p))))
    return 0;
  if ((e)-(p) <= 1)
    return 0;                           /* no room for a second byte */
  return (isbig5tail(*((p)+1))) ? 2 : 0;
}

my_bool ismbhead_big5(uint c)
/* Result of the isbig5head() test on c, as a my_bool. cs is not used. */
my_bool ismbhead_big5(CHARSET_INFO *cs, uint c)
{
  my_bool is_head= isbig5head(c);
  return is_head;
}

int mbcharlen_big5(uint c)
/* Return 2 when c passes isbig5head(), otherwise 0. cs is not used. */
int mbcharlen_big5(CHARSET_INFO *cs, uint c)
{
  if (isbig5head(c))
    return 2;
  return 0;
}
Expand Down
6 changes: 3 additions & 3 deletions strings/ctype-euc_kr.c
Expand Up @@ -183,19 +183,19 @@ uchar NEAR sort_order_euc_kr[]=
#define iseuc_kr(c) ((0xa1<=(uchar)(c) && (uchar)(c)<=0xfe))


int ismbchar_euc_kr(const char* p, const char *e)
/*
  Return 2 when the bytes at p and p+1 both pass iseuc_kr() and the second
  byte lies inside [p, e); return 0 otherwise (including for any first
  byte below 0x80). cs is not used.
*/
int ismbchar_euc_kr(CHARSET_INFO *cs, const char *p, const char *e)
{
  if (*(uchar*)(p) < 0x80)
    return 0;
  if (iseuc_kr(*(p)) && (e)-(p) > 1 && iseuc_kr(*((p)+1)))
    return 2;
  return 0;
}

my_bool ismbhead_euc_kr(uint c)
/* Result of the iseuc_kr() test on c, as a my_bool. cs is not used. */
my_bool ismbhead_euc_kr(CHARSET_INFO *cs, uint c)
{
  my_bool is_head= (iseuc_kr(c));
  return is_head;
}

int mbcharlen_euc_kr(uint c)
/* Return 2 when c passes iseuc_kr(), otherwise 0. cs is not used. */
int mbcharlen_euc_kr(CHARSET_INFO *cs, uint c)
{
  if (iseuc_kr(c))
    return 2;
  return 0;
}
Expand Down
6 changes: 3 additions & 3 deletions strings/ctype-gb2312.c
Expand Up @@ -166,17 +166,17 @@ uchar NEAR sort_order_gb2312[]=
#define isgb2312tail(c) (0xa1<=(uchar)(c) && (uchar)(c)<=0xfe)


int ismbchar_gb2312(const char* p, const char *e)
/*
  Return 2 when p points at a gb2312 head byte that is followed, inside
  [p, e), by a gb2312 tail byte; return 0 otherwise. cs is not used.
*/
int ismbchar_gb2312(CHARSET_INFO *cs, const char *p, const char *e)
{
  if (!(isgb2312head(*(p))))
    return 0;
  if ((e)-(p) <= 1)
    return 0;                           /* no room for a second byte */
  return (isgb2312tail(*((p)+1))) ? 2 : 0;
}

my_bool ismbhead_gb2312(uint c)
/* Result of the isgb2312head() test on c, as a my_bool. cs is not used. */
my_bool ismbhead_gb2312(CHARSET_INFO *cs, uint c)
{
  my_bool is_head= isgb2312head(c);
  return is_head;
}

int mbcharlen_gb2312(uint c)
/* Return 2 when c passes isgb2312head(), otherwise 0. cs is not used. */
int mbcharlen_gb2312(CHARSET_INFO *cs, uint c)
{
  if (isgb2312head(c))
    return 2;
  return 0;
}
Expand Down
6 changes: 3 additions & 3 deletions strings/ctype-gbk.c
Expand Up @@ -2704,17 +2704,17 @@ extern my_bool my_like_range_gbk(CHARSET_INFO *cs,
}


int ismbchar_gbk(const char* p, const char *e)
/*
  Return 2 when p points at a gbk head byte that is followed, inside
  [p, e), by a gbk tail byte; return 0 otherwise. cs is not used.
*/
int ismbchar_gbk(CHARSET_INFO *cs, const char *p, const char *e)
{
  if (!(isgbkhead(*(p))))
    return 0;
  if ((e)-(p) <= 1)
    return 0;                           /* no room for a second byte */
  return (isgbktail(*((p)+1))) ? 2 : 0;
}

my_bool ismbhead_gbk(uint c)
/* Result of the isgbkhead() test on c, as a my_bool. cs is not used. */
my_bool ismbhead_gbk(CHARSET_INFO *cs, uint c)
{
  my_bool is_head= isgbkhead(c);
  return is_head;
}

int mbcharlen_gbk(uint c)
/* Return 2 when c passes isgbkhead(), otherwise 0. cs is not used. */
int mbcharlen_gbk(CHARSET_INFO *cs, uint c)
{
  if (isgbkhead(c))
    return 2;
  return 0;
}
Expand Down
16 changes: 8 additions & 8 deletions strings/ctype-sjis.c
Expand Up @@ -183,17 +183,17 @@ uchar NEAR sort_order_sjis[]=
(0x80<=(c) && (c)<=0xfc))


int ismbchar_sjis(const char* p, const char *e)
/*
  Return 2 when p points at an sjis head byte that is followed, inside
  [p, e), by an sjis tail byte; return 0 otherwise. Both bytes are tested
  as uchar. cs is not used.
*/
int ismbchar_sjis(CHARSET_INFO *cs, const char *p, const char *e)
{
  if (!(issjishead((uchar) *p)))
    return 0;
  if ((e-p) <= 1)
    return 0;                           /* no room for a second byte */
  return (issjistail((uchar)p[1])) ? 2 : 0;
}

my_bool ismbhead_sjis(uint c)
/*
  Result of the issjishead() test on c (narrowed to uchar first), as a
  my_bool. cs is not used.
*/
my_bool ismbhead_sjis(CHARSET_INFO *cs, uint c)
{
  my_bool is_head= issjishead((uchar) c);
  return is_head;
}

int mbcharlen_sjis(uint c)
/*
  Return 2 when c (narrowed to uchar first) passes issjishead(),
  otherwise 0. cs is not used.
*/
int mbcharlen_sjis(CHARSET_INFO *cs, uint c)
{
  if (issjishead((uchar) c))
    return 2;
  return 0;
}
Expand All @@ -208,8 +208,8 @@ int my_strnncoll_sjis(CHARSET_INFO *cs,
const uchar *e1 = s1 + len1;
const uchar *e2 = s2 + len2;
while (s1 < e1 && s2 < e2) {
if (ismbchar_sjis((char*) s1, (char*) e1) &&
ismbchar_sjis((char*) s2, (char*) e2)) {
if (ismbchar_sjis(cs,(char*) s1, (char*) e1) &&
ismbchar_sjis(cs,(char*) s2, (char*) e2)) {
uint c1 = sjiscode(*s1, *(s1+1));
uint c2 = sjiscode(*s2, *(s2+1));
if (c1 != c2)
Expand All @@ -233,7 +233,7 @@ int my_strnxfrm_sjis(CHARSET_INFO *cs,
uchar *d_end = dest + len;
uchar *s_end = (uchar*) src + srclen;
while (dest < d_end && src < s_end) {
if (ismbchar_sjis((char*) src, (char*) s_end)) {
if (ismbchar_sjis(cs,(char*) src, (char*) s_end)) {
*dest++ = *src++;
if (dest < d_end && src < s_end)
*dest++ = *src++;
Expand Down Expand Up @@ -275,15 +275,15 @@ my_bool my_like_range_sjis(CHARSET_INFO *cs,
char *min_end=min_str+res_length;

while (ptr < end && min_str < min_end) {
if (ismbchar_sjis(ptr, end)) {
if (ismbchar_sjis(cs, ptr, end)) {
*min_str++ = *max_str++ = *ptr++;
if (min_str < min_end)
*min_str++ = *max_str++ = *ptr++;
continue;
}
if (*ptr == escape && ptr+1 < end) {
ptr++; /* Skip escape */
if (ismbchar_sjis(ptr, end))
if (ismbchar_sjis(cs, ptr, end))
*min_str++ = *max_str++ = *ptr++;
if (min_str < min_end)
*min_str++ = *max_str++ = *ptr++;
Expand Down
6 changes: 3 additions & 3 deletions strings/ctype-ujis.c
Expand Up @@ -183,7 +183,7 @@ uchar NEAR sort_order_ujis[]=
#define isujis_ss3(c) (((c)&0xff) == 0x8f)


int ismbchar_ujis(const char* p, const char *e)
int ismbchar_ujis(CHARSET_INFO *cs,const char* p, const char *e)
{
return ((*(uchar*)(p)<0x80)? 0:\
isujis(*(p)) && (e)-(p)>1 && isujis(*((p)+1))? 2:\
Expand All @@ -192,12 +192,12 @@ int ismbchar_ujis(const char* p, const char *e)
0);
}

my_bool ismbhead_ujis(uint c)
/*
  Return 1 when c passes any of isujis(), isujis_ss2() or isujis_ss3()
  (tested in that order), otherwise 0. cs is not used.
*/
my_bool ismbhead_ujis(CHARSET_INFO *cs, uint c)
{
  if (isujis(c))
    return 1;
  if (isujis_ss2(c))
    return 1;
  if (isujis_ss3(c))
    return 1;
  return 0;
}

int mbcharlen_ujis(uint c)
/*
  Length implied by the lead byte c: 2 when it passes isujis() or
  isujis_ss2(), 3 when it passes isujis_ss3(), otherwise 0.
  cs is not used.
*/
int mbcharlen_ujis(CHARSET_INFO *cs, uint c)
{
  if (isujis(c))
    return 2;
  if (isujis_ss2(c))
    return 2;
  if (isujis_ss3(c))
    return 3;
  return 0;
}
Expand Down

0 comments on commit 55e0a9c

Please sign in to comment.