From 1217289737588e65b088b3535428b27c7287d699 Mon Sep 17 00:00:00 2001 From: Manuel Novoa III Date: Fri, 1 Aug 2003 20:08:59 +0000 Subject: Add a new *scanf implementation, including the *wscanf functions. Should be standards compliant and with several optional features, including support for hexadecimal float notation, locale awareness, glibc-like locale-specific digit grouping with the `'' flag, and positional arg support. I tested it pretty well (finding several bugs in glibc's scanf in the process), but it is brand new so be aware. The *wprintf functions now support floating point output. Also, a couple of bugs were squashed. Finally, %a/%A conversions are now implemented. Implement the glibc xlocale interface for thread-specific locale support. Also add the various *_l(args, locale_t loc_arg) funcs. NOTE!!! setlocale() is NOT threadsafe! NOTE!!! The strto{floating point} conversion functions are now locale aware. They also now support hexadecimal floating point notation. Add the wcsto{floating point} conversion functions. Fix a bug in mktime() related to dst. Note that unlike glibc's mktime, uClibc's version always normalizes the struct tm before attempting to determine the correct dst setting if tm_isdst == -1 on entry. Add a stub version of the libintl functions. (untested) Fixed a known memory leak in setlocale() related to the collation data. Add lots of new config options (which Erik agreed to sort out :-), including finally exposing some of the stripped down stdio configs. Be careful with those though, as they haven't been tested in a long time. (temporary) GOTCHAs... The ctype functions are currently incorrect for 8-bit locales. They will be fixed shortly. The ctype functions are now table-based, resulting in larger statically linked binaries. I'll be adding an option to use the old approach in the stub locale configuration. 
--- libc/misc/wchar/Makefile | 1 - libc/misc/wchar/wchar.c | 111 ++++++++++++++++++++++++++++++++--------------- libc/misc/wchar/wstdio.c | 37 +++++++++++----- 3 files changed, 102 insertions(+), 47 deletions(-) (limited to 'libc/misc/wchar') diff --git a/libc/misc/wchar/Makefile b/libc/misc/wchar/Makefile index acc852195..4a3e2fc7a 100644 --- a/libc/misc/wchar/Makefile +++ b/libc/misc/wchar/Makefile @@ -42,7 +42,6 @@ MOBJ2= fwide.o \ # getwc (fgetwc alias) getwc_unlocked (fgetwc_unlocked alias) # putwc (fputwc alias) putwc_unlocked (fputwc_unlocked alias) -# fwscanf wscanf swscanf vfwscanf vwscanf vswscanf # wcsftime OBJS=$(MOBJ1) $(MOBJ2) diff --git a/libc/misc/wchar/wchar.c b/libc/misc/wchar/wchar.c index ff3e42a84..aa0c2735a 100644 --- a/libc/misc/wchar/wchar.c +++ b/libc/misc/wchar/wchar.c @@ -86,6 +86,9 @@ * Add a couple of ugly hacks to support *wprintf. * Add a mini iconv() and iconv implementation (requires locale support). * + * Aug 1, 2003 + * Bug fix for mbrtowc. + * * Manuel */ @@ -101,13 +104,39 @@ #include #include #include +#include +/**********************************************************************/ #ifdef __UCLIBC_HAS_LOCALE__ -#define ENCODING (__global_locale.encoding) +#ifdef __UCLIBC_MJN3_ONLY__ +#ifdef L_iswspace +/* generates one warning */ +#warning TODO: Fix Cc2wc* and Cwc2c* defines! +#endif +#endif /* __UCLIBC_MJN3_ONLY__ */ + +#define ENCODING ((__UCLIBC_CURLOCALE_DATA).encoding) + +#define Cc2wc_IDX_SHIFT __LOCALE_DATA_Cc2wc_IDX_SHIFT +#define Cc2wc_ROW_LEN __LOCALE_DATA_Cc2wc_ROW_LEN +#define Cwc2c_DOMAIN_MAX __LOCALE_DATA_Cwc2c_DOMAIN_MAX +#define Cwc2c_TI_SHIFT __LOCALE_DATA_Cwc2c_TI_SHIFT +#define Cwc2c_TT_SHIFT __LOCALE_DATA_Cwc2c_TT_SHIFT +#define Cwc2c_TI_LEN __LOCALE_DATA_Cwc2c_TI_LEN + #ifndef __CTYPE_HAS_UTF_8_LOCALES #warning __CTYPE_HAS_UTF_8_LOCALES not set! 
#endif -#else + +#else /* __UCLIBC_HAS_LOCALE__ */ + +#ifdef __UCLIBC_MJN3_ONLY__ +#ifdef L_btowc +/* emit only once */ +#warning fix preprocessor logic testing locale settings +#endif +#endif + #define ENCODING (__ctype_encoding_7_bit) #ifdef __CTYPE_HAS_8_BIT_LOCALES #error __CTYPE_HAS_8_BIT_LOCALES is defined! @@ -117,7 +146,9 @@ #endif #undef L__wchar_utf8sntowcs #undef L__wchar_wcsntoutf8s -#endif + +#endif /* __UCLIBC_HAS_LOCALE__ */ +/**********************************************************************/ #if WCHAR_MAX > 0xffffUL #define UTF_8_MAX_LEN 6 @@ -266,11 +297,18 @@ size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, #ifdef __CTYPE_HAS_UTF_8_LOCALES /* Need to do this here since mbsrtowcs doesn't allow incompletes. */ if (ENCODING == __ctype_encoding_utf8) { + if (!pwc) { + pwc = wcbuf; + } r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1); return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */ } #endif +#ifdef __UCLIBC_MJN3_ONLY__ +#warning TODO: This adds a trailing nul! +#endif /* __UCLIBC_MJN3_ONLY__ */ + r = __mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps); if (((ssize_t) r) >= 0) { @@ -291,7 +329,10 @@ size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, size_t wcrtomb(register char *__restrict s, wchar_t wc, mbstate_t *__restrict ps) { - wchar_t wcbuf[2]; +#ifdef __UCLIBC_MJN3_ONLY__ +#warning TODO: Should wcsnrtombs nul-terminate unconditionally? Check glibc. +#endif /* __UCLIBC_MJN3_ONLY__ */ + wchar_t wcbuf[1]; const wchar_t *pwc; size_t r; char buf[MB_LEN_MAX]; @@ -303,9 +344,8 @@ size_t wcrtomb(register char *__restrict s, wchar_t wc, pwc = wcbuf; wcbuf[0] = wc; - wcbuf[1] = 0; - r = __wcsnrtombs(s, &pwc, SIZE_MAX, MB_LEN_MAX, ps); + r = __wcsnrtombs(s, &pwc, 1, MB_LEN_MAX, ps); return (r != 0) ? r : 1; } @@ -418,7 +458,7 @@ size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn, if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... 
*/ mask = 0x40; #ifdef __UCLIBC_MJN3_ONLY__ -#warning fix range for 16 bit wides +#warning TODO: Fix range for 16 bit wchar_t case. #endif if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) { goto START; @@ -495,7 +535,6 @@ size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn, COMPLETE: *pwc = wc; pwc += incr; - } #ifdef DECODER while (--count); @@ -684,8 +723,8 @@ size_t __mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src, while (count) { if ((wc = ((unsigned char)(*s))) >= 0x80) { /* Non-ASCII... */ wc -= 0x80; - wc = __global_locale.tbl8c2wc[ - (__global_locale.idx8c2wc[wc >> Cc2wc_IDX_SHIFT] + wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[ + (__UCLIBC_CURLOCALE_DATA.idx8c2wc[wc >> Cc2wc_IDX_SHIFT] << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))]; if (!wc) { goto BAD; @@ -797,12 +836,12 @@ size_t __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src, } else { u = 0; if (wc <= Cwc2c_DOMAIN_MAX) { - u = __global_locale.idx8wc2c[wc >> (Cwc2c_TI_SHIFT + u = __UCLIBC_CURLOCALE_DATA.idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)]; - u = __global_locale.tbl8wc2c[(u << Cwc2c_TI_SHIFT) + u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT) + ((wc >> Cwc2c_TT_SHIFT) & ((1 << Cwc2c_TI_SHIFT)-1))]; - u = __global_locale.tbl8wc2c[Cwc2c_TI_LEN + u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN + (u << Cwc2c_TT_SHIFT) + (wc & ((1 << Cwc2c_TT_SHIFT)-1))]; } @@ -859,7 +898,8 @@ size_t __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src, #ifdef L_wcswidth #ifdef __UCLIBC_MJN3_ONLY__ -#warning if we start doing translit, wcwidth and wcswidth will need updating. +#warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating. +#warning TODO: Update wcwidth to match latest by Kuhn. 
#endif #if defined(__UCLIBC_HAS_LOCALE__) && \ @@ -1163,7 +1203,7 @@ enum { * */ -const unsigned char codesets[] = +const unsigned char __iconv_codesets[] = "\x0a\xe0""WCHAR_T\x00" /* superset of UCS-4 but platform-endian */ #if __BYTE_ORDER == __BIG_ENDIAN "\x08\xec""UCS-4\x00" /* always BE */ @@ -1201,7 +1241,7 @@ static int find_codeset(const char *name) const unsigned char *s; int codeset; - for (s = codesets ; *s ; s += *s) { + for (s = __iconv_codesets ; *s ; s += *s) { if (!strcasecmp(s+2, name)) { return s[1]; } @@ -1212,10 +1252,10 @@ static int find_codeset(const char *name) /* TODO: maybe CODESET_LIST + *s ??? */ /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */ codeset = 2; - s = CODESET_LIST; + s = __LOCALE_DATA_CODESET_LIST; do { ++codeset; /* Increment codeset first. */ - if (!strcasecmp(CODESET_LIST+*s, name)) { + if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) { return codeset; } } while (*++s); @@ -1223,7 +1263,7 @@ static int find_codeset(const char *name) return 0; /* No matching codeset! */ } -iconv_t iconv_open(const char *tocode, const char *fromcode) +iconv_t weak_function iconv_open(const char *tocode, const char *fromcode) { register _UC_iconv_t *px; int tocodeset, fromcodeset; @@ -1244,16 +1284,17 @@ iconv_t iconv_open(const char *tocode, const char *fromcode) return (iconv_t)(-1); } -int iconv_close(iconv_t cd) +int weak_function iconv_close(iconv_t cd) { free(cd); return 0; } -size_t iconv(iconv_t cd, char **__restrict inbuf, - size_t *__restrict inbytesleft, - char **__restrict outbuf, size_t *__restrict outbytesleft) +size_t weak_function iconv(iconv_t cd, char **__restrict inbuf, + size_t *__restrict inbytesleft, + char **__restrict outbuf, + size_t *__restrict outbytesleft) { _UC_iconv_t *px = (_UC_iconv_t *) cd; size_t nrcount, r; @@ -1362,9 +1403,9 @@ size_t iconv(iconv_t cd, char **__restrict inbuf, return (size_t)(-1); } #ifdef __UCLIBC_MJN3_ONLY__ -#warning optimize this +#warning TODO: optimize this. 
#endif - if (p != NULL) { /* incomplet char case */ + if (p != NULL) { /* incomplete char case */ goto INVALID; } p = *inbuf + 1; /* nul */ @@ -1374,10 +1415,10 @@ size_t iconv(iconv_t cd, char **__restrict inbuf, if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */ goto ILLEGAL; } else { /* some other 8-bit ascii-extension codeset */ - const codeset_8_bit_t *c8b + const __codeset_8_bit_t *c8b = __locale_mmap->codeset_8_bit + px->fromcodeset - 3; wc -= 0x80; - wc = __global_locale.tbl8c2wc[ + wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[ (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT] << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))]; if (!wc) { @@ -1439,7 +1480,7 @@ size_t iconv(iconv_t cd, char **__restrict inbuf, r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1); if (r != (size_t)(-1)) { #ifdef __UCLIBC_MJN3_ONLY__ -#warning what happens for a nul? +#warning TODO: What happens for a nul? #endif if (r == 0) { if (wc != 0) { @@ -1458,14 +1499,14 @@ size_t iconv(iconv_t cd, char **__restrict inbuf, **outbuf = wc; } else { if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) { - const codeset_8_bit_t *c8b + const __codeset_8_bit_t *c8b = __locale_mmap->codeset_8_bit + px->tocodeset - 3; __uwchar_t u; u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)]; - u = __global_locale.tbl8wc2c[(u << Cwc2c_TI_SHIFT) + u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT) + ((wc >> Cwc2c_TT_SHIFT) & ((1 << Cwc2c_TI_SHIFT)-1))]; - wc = __global_locale.tbl8wc2c[Cwc2c_TI_LEN + wc = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN + (u << Cwc2c_TT_SHIFT) + (wc & ((1 << Cwc2c_TT_SHIFT)-1))]; if (wc) { @@ -1497,7 +1538,7 @@ size_t iconv(iconv_t cd, char **__restrict inbuf, #include #include -extern const unsigned char codesets[]; +extern const unsigned char __iconv_codesets[]; #define IBUF BUFSIZ #define OBUF BUFSIZ @@ -1572,12 +1613,12 @@ int main(int argc, char **argv) if (opts[5]) { /* -l */ fprintf(stderr, "Recognized codesets:\n"); - for (s = codesets ; *s ; s += *s) { + 
for (s = __iconv_codesets ; *s ; s += *s) { fprintf(stderr," %s\n", s+2); } - s = CODESET_LIST; + s = __LOCALE_DATA_CODESET_LIST; do { - fprintf(stderr," %s\n", CODESET_LIST+ (unsigned char)(*s)); + fprintf(stderr," %s\n", __LOCALE_DATA_CODESET_LIST+ (unsigned char)(*s)); } while (*++s); return EXIT_SUCCESS; diff --git a/libc/misc/wchar/wstdio.c b/libc/misc/wchar/wstdio.c index dfeb35c30..1069ee938 100644 --- a/libc/misc/wchar/wstdio.c +++ b/libc/misc/wchar/wstdio.c @@ -171,7 +171,6 @@ UNLOCKED(wint_t,fgetwc,(register FILE *stream),(stream)) size_t r; unsigned char c[1]; unsigned char sbuf[1]; - unsigned char ungot_width; /* Support ftell after wscanf ungetwc. */ wi = WEOF; /* Prepare for failure. */ @@ -183,8 +182,18 @@ UNLOCKED(wint_t,fgetwc,(register FILE *stream),(stream)) stream->modeflags |= __FLAG_WIDE; if (stream->modeflags & __MASK_UNGOT) {/* Any ungetwc()s? */ - assert( (stream->modeflags & (__FLAG_READING|__FLAG_ERROR)) - == __FLAG_READING); + + assert(stream->modeflags & __FLAG_READING); + +/* assert( (stream->modeflags & (__FLAG_READING|__FLAG_ERROR)) */ +/* == __FLAG_READING); */ + + if ((((stream->modeflags & __MASK_UNGOT) > 1) || stream->ungot[1])) { + stream->ungot_width[0] = 0; /* Application ungot... */ + } else { + stream->ungot_width[0] = stream->ungot_width[1]; /* scanf ungot */ + } + wi = stream->ungot[(--stream->modeflags) & __MASK_UNGOT]; stream->ungot[1] = 0; goto DONE; @@ -196,7 +205,9 @@ UNLOCKED(wint_t,fgetwc,(register FILE *stream),(stream)) ++stream->bufend; } - ungot_width = 0; + if (stream->state.mask == 0) { /* If last was a complete char */ + stream->ungot_width[0] = 0; /* then reset the width. */ + } LOOP: if ((n = stream->bufread - stream->bufpos) == 0) { @@ -204,12 +215,12 @@ UNLOCKED(wint_t,fgetwc,(register FILE *stream),(stream)) } r = mbrtowc(wc, stream->bufpos, n, &stream->state); - if (((ssize_t) r) >= 0) { /* Single byte... */ + if (((ssize_t) r) >= 0) { /* Success... */ if (r == 0) { /* Nul wide char... 
means 0 byte for us so */ ++r; /* increment r and handle below as single. */ } stream->bufpos += r; - stream->ungot_width[0] = ungot_width + r; + stream->ungot_width[0] += r; wi = *wc; goto DONE; } @@ -217,7 +228,7 @@ UNLOCKED(wint_t,fgetwc,(register FILE *stream),(stream)) if (r == ((size_t) -2)) { /* Potentially valid but incomplete and no more buffered. */ stream->bufpos += n; /* Update bufpos for stream. */ - ungot_width += n; + stream->ungot_width[0] += n; FILL_BUFFER: if (_stdio_fread(c, (size_t) 1, stream) > 0) { assert(stream->bufpos == stream->bufstart + 1); @@ -371,7 +382,8 @@ UNLOCKED(int,fputws,(const wchar_t *__restrict ws, #ifdef L_ungetwc /* * Note: This is the application-callable ungetwc. If wscanf calls this, it - * should also set stream->ungot[1] to 0 if this is the only ungot. + * should also set stream->ungot[1] to 0 if this is the only ungot, as well + * as reset stream->ungot_width[1] for use by _stdio_adjpos(). */ /* Reentrant. */ @@ -389,8 +401,7 @@ wint_t ungetwc(wint_t c, register FILE *stream) } stream->modeflags |= __FLAG_WIDE; - /* If can't read or there's been an error, or c == EOF, or ungot slots - * already filled, then return EOF */ + /* If can't read or c == WEOF or ungot slots already filled, then fail. */ if ((stream->modeflags & (__MASK_UNGOT2|__FLAG_WRITEONLY #ifndef __STDIO_AUTO_RW_TRANSITION @@ -406,14 +417,18 @@ wint_t ungetwc(wint_t c, register FILE *stream) /* ungot_width */ #ifdef __STDIO_BUFFERS - /* TODO: shouldn't allow writing??? */ +#ifdef __STDIO_AUTO_RW_TRANSITION if (stream->modeflags & __FLAG_WRITING) { fflush_unlocked(stream); /* Commit any write-buffered chars. */ } +#endif /* __STDIO_AUTO_RW_TRANSITION */ #endif /* __STDIO_BUFFERS */ /* Clear EOF and WRITING flags, and set READING FLAG */ stream->modeflags &= ~(__FLAG_EOF|__FLAG_WRITING); +#ifdef __UCLIBC_MJN3_ONLY__ +#warning CONSIDER: Is setting the reading flag after an ungetwc necessary? 
+#endif /* __UCLIBC_MJN3_ONLY__ */ stream->modeflags |= __FLAG_READING; stream->ungot[1] = 1; /* Flag as app ungetc call; wscanf fixes up. */ stream->ungot[(stream->modeflags++) & __MASK_UNGOT] = c; -- cgit v1.2.3