/*
 *  R : A Computer Language for Statistical Data Analysis
 *  Copyright (C) 2005-2022   The R Core Team
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, a copy is available at
 *  https://www.R-project.org/Licenses/
 */

/*  The original version of this file was contributed in 2005 by Ei-ji
 *  Nakama, along with rlocale_data.h and ../include/rlocale.h.
 *
 *  The naming is misleading: apart from the width data this is not
 *  locale-specific.  It is rather about the use of non-Latin
 *  characters (including symbols, emojis ...).
 *
 *  It provides replacements for the wctype (iswxxxxx) functions on
 *  Windows (where they are not correct in e.g. Japanese)
 *  AIX (missing)
 *  macOS in CJK (where these just called the ctype functions)
 *
 *  It also provides wc[s]width, where widths of CJK fonts are often
 *  wrong in vendor-supplied versions and in Markus Kuhn's version
 *  used for Windows in R 2.[12].x.
*/ #ifdef HAVE_CONFIG_H #include #endif #ifdef HAVE_VISIBILITY_ATTRIBUTE # define attribute_hidden __attribute__ ((visibility ("hidden"))) #else # define attribute_hidden #endif #include #include #define IN_RLOCALE_C 1 /* used in rlocale.h */ #include #include #include #include #include #include #include #include /* for localeCP */ #if defined(USE_RI18N_WIDTH) || defined(USE_RI18N_FNS) /* used for zero-width table and in rlocale_data.h */ struct interval { int first; int last; }; // This seems based on Markus Kuhn's function but with 1-based 'max' static int wcsearch(int wint, const struct interval *table, int max) { int min = 0; int mid; max--; if (wint < table[0].first || wint > table[max].last) return 0; while (max >= min) { mid = (min + max) / 2; if (wint > table[mid].last) min = mid + 1; else if (wint < table[mid].first) max = mid - 1; else return 1; } return 0; } #endif // ------------------------- width functions -------------------- #ifdef USE_RI18N_WIDTH /* wcwidth and wcswidth, from POSIX 2001 (formerly in some draft C * standards but not implemented in Windows). One could argue that * the width of non-printable / unassigned characters is immaterial * (they will be represented by escapes) so could be given a * conventional value such as 0 or 1. POSIX suggests returning -1 for * non-printable characters, but these were not written that way in * 2005. * * It is not always clear what to do for unassigned code points * (especially 'private use' ones). * * Although what a character represents may be locale-specific, * reference images are available at * https://www.unicode.org/charts/PDF/ whose width can be assessed. * * There is a problem with character-by-character appoaches: apart * from surrogate pairs, some glyphs are defined in combinations of * others. For human languages this is largely (but not entirely) * covered by giving combining characters zero width. However, quite * a few emoji are defined as combinations of others: e.g. 
* 'polar bear' is bear+snowflake with a zero-width joiner, * "\U1f43b\u200d\u2744" which gets width 2+0+1. * (https://emojipedia.org/polar-bear/, * https://emojipedia.org/emoji-zwj-sequence/). There are a few such * in human languages: * https://en.wikipedia.org/wiki/Zero-width_joiner. */ #include "rlocale_widths.h" static int wcwidthsearch(int wint, const struct interval_wcwidth *table, int max, int locale) { int min = 0; int mid; max--; /* This quickly gives one for printing ASCII characters */ if (wint > 0x1F && wint < 0x7F) return 1; else if (wint < table[min].first || wint > table[max].last) return -1; while (max >= min) { mid = (min + max) / 2; if (wint > table[mid].last) min = mid + 1; else if (wint < table[mid].first) max = mid - 1; else{ return(table[mid].mb[locale]); } } return -1; } /* The idea here here has never been explained. See also the comments in ../include/rlocale.h. That does not explain the separate entries for Singapore (simplified) and Hong Kong/Macau (traditional) where it seems the Windows system font is not different from PRC/Taiwan respectively, nor what font was used for non-Windows, nor where the values came from. Except perhaps on macOS, the non-Windows locale names are for the default MBCS encodings (e.g. GBK, GB1312, BIG5, EUCJP, EUCKR). There are other non-UTF-8 encodings for those locales, e.g. ja_JP.SJIS, ko_KR.CP949, zh_CN.eucCN, zh_HK.Big5HKSCS. 
*/ typedef struct { char *name; int locale; } cjk_locale_name_t; static cjk_locale_name_t cjk_locale_name[] = { // Windows locale names {"CHINESE(SINGAPORE)_SIGNAPORE", MB_zh_SG}, {"CHINESE_SIGNAPORE", MB_zh_SG}, {"CHINESE(PRC)_PEOPLE'S REPUBLIC OF CHINA", MB_zh_CN}, {"CHINESE_PEOPLE'S REPUBLIC OF CHINA", MB_zh_CN}, {"CHINESE_MACAU S.A.R.", MB_zh_HK}, {"CHINESE(PRC)_HONG KONG", MB_zh_HK}, {"CHINESE_HONG KONG S.A.R.", MB_zh_HK}, {"CHINESE(TAIWAN)_TAIWAN", MB_zh_TW}, {"CHINESE_TAIWAN", MB_zh_TW}, {"CHINESE-S", MB_zh_CN}, {"CHINESE-T", MB_zh_TW}, {"JAPANESE_JAPAN", MB_ja_JP}, {"JAPANESE", MB_ja_JP}, {"KOREAN_KOREA", MB_ko_KR}, {"KOREAN", MB_ko_KR}, // Other OSes, but only in default encodings. {"ZH_TW", MB_zh_TW}, {"ZH_CN", MB_zh_CN}, {"ZH_CN.BIG5", MB_zh_TW}, {"ZH_HK", MB_zh_HK}, {"ZH_SG", MB_zh_SG}, {"JA_JP", MB_ja_JP}, {"KO_KR", MB_ko_KR}, {"ZH", MB_zh_CN}, {"JA", MB_ja_JP}, {"KO", MB_ko_KR}, // Default, where all EA Ambiguous characters have width one. {"", MB_Default}, }; static int get_locale_id() { char lc_str[128]; unsigned int i, j; strncpy(lc_str, setlocale(LC_CTYPE, NULL), sizeof(lc_str) - 1); lc_str[sizeof(lc_str) - 1] = '\0'; for (i = 0, j = (int) strlen(lc_str); i < j && i < sizeof(lc_str); i++) lc_str[i] = (char) toupper(lc_str[i]); for (i = 0; i < (sizeof(cjk_locale_name)/sizeof(cjk_locale_name_t)); i++) { if (0 == strncmp(cjk_locale_name[i].name, lc_str, strlen(cjk_locale_name[i].name))) return cjk_locale_name[i].locale; } return 0; } /* used in character.c, ../gnuwin32/console.c (for an MBCS locale) , ../library/grDevices/src/devP*.c Unlike the POSIX description this does not return -1 for non-printable Unicode points. NB: Windows (at least MinGW-W64) does not have this function. 
*/ int Ri18n_wcwidth(R_wchar_t c) { /* This quickly gives one for printing ASCII characters */ if (c > 0x1F && c < 0x7F) return 1; static int lc = 0; #ifdef Win32 static int localeCP_lc = -1; if (localeCP_lc != localeCP) { lc = get_locale_id(); localeCP_lc = localeCP; } #else static char encname_lc[R_CODESET_MAX + 1] = ""; if (strcmp(encname_lc, R_nativeEncoding())) { lc = get_locale_id(); strncpy(encname_lc, R_nativeEncoding(), R_CODESET_MAX); encname_lc[R_CODESET_MAX] = '\0'; } #endif int wd = wcwidthsearch(c, table_wcwidth, (sizeof(table_wcwidth)/sizeof(struct interval_wcwidth)), lc); if (wd >= 0) return wd; // currently all are 1 or 2. int zw = wcsearch(c, zero_width, zero_width_count); return zw ? 0 : 1; // assume unknown chars are width one. } /* Used in character.c, errors.c, ../gnuwin32/console.c (for an MBCS locale) Strings in R are restricted to 2^31-1 bytes but could conceivably have a width exceeding that. Unlike the POSIX description this does not return -1 for strings containing non-printable Unicode points. NB: Windows (at least MinGW-W64) does not have this function. */ attribute_hidden int Ri18n_wcswidth (const wchar_t *wc, size_t n) { int rs = 0; while ((n-- > 0) && (*wc != L'\0')) { if (IS_SURROGATE_PAIR(*wc, *(wc+1))) { /* surrogate pairs should only occur with 'short' wchar_t, * that is Windows and perhaps 32-bit AIX */ R_wchar_t val = ((*wc & 0x3FF) << 10) + (*(wc+1) & 0x3FF) + 0x010000; int now = Ri18n_wcwidth (val); if (now == -1) return -1; rs += now; wc += 2; } else { int now = Ri18n_wcwidth (*wc); if (now == -1) return -1; rs += now; wc++; } } return rs; } #endif // ------------------- end of width functions -------------------- /********************************************************************* * macOS's wide character type functions are based on NetBSD * and only work(ed) correctly for Latin-1 characters. * So we replace them. 
May also be needed on *BSD, and are on AIX ********************************************************************/ #ifdef USE_RI18N_FNS # define ISWFUNC(ISWNAME) static int Ri18n_isw ## ISWNAME (wint_t wc) \ { \ return wcsearch(wc,table_w ## ISWNAME , table_w ## ISWNAME ## _count); \ } #include "rlocale_data.h" /* These are the functions which C99 and POSIX define. However, not all are used elsewhere in R (so are static), but they are used in Ri18n_iswctype. */ ISWFUNC(upper) ISWFUNC(lower) ISWFUNC(alpha) ISWFUNC(digit) ISWFUNC(xdigit) ISWFUNC(space) ISWFUNC(print) /* derived below from print and space ISWFUNC(graph) */ ISWFUNC(blank) ISWFUNC(cntrl) ISWFUNC(punct) /* defined below in terms of digit and alpha ISWFUNC(alnum) */ wctype_t Ri18n_wctype(const char *); int Ri18n_iswctype(wint_t, wctype_t); static int Ri18n_iswalnum (wint_t wc) { return (Ri18n_iswctype(wc, Ri18n_wctype("digit")) || Ri18n_iswctype(wc, Ri18n_wctype("alpha")) ); } /* Defined in the C99 standard as 'any wide character for which iswprint is true and iswspace is false' As this is used rarely (and iswprint is used a lot), we chose to derive this one. */ static int Ri18n_iswgraph (wint_t wc) { return (Ri18n_iswctype(wc, Ri18n_wctype("print")) && !Ri18n_iswctype(wc, Ri18n_wctype("space")) ); } /* * iswctype */ typedef struct { char * name; wctype_t wctype; int(*func)(wint_t); } Ri18n_wctype_func_l ; static const Ri18n_wctype_func_l Ri18n_wctype_func[] = { {"upper", 1<<0, Ri18n_iswupper}, {"lower", 1<<1, Ri18n_iswlower}, {"alpha", 1<<2, Ri18n_iswalpha}, {"digit", 1<<3, Ri18n_iswdigit}, {"xdigit", 1<<4, Ri18n_iswxdigit}, {"space", 1<<5, Ri18n_iswspace}, {"print", 1<<6, Ri18n_iswprint}, {"graph", 1<<7, Ri18n_iswgraph}, {"blank", 1<<8, Ri18n_iswblank}, {"cntrl", 1<<9, Ri18n_iswcntrl}, {"punct", 1<<10, Ri18n_iswpunct}, {"alnum", 1<<11, Ri18n_iswalnum}, {NULL, 0, NULL} }; /* These two are used (via macros) in X11 dataentry so need to be visible. 
*/ wctype_t Ri18n_wctype(const char *name) { int i; for (i = 0 ; Ri18n_wctype_func[i].name != NULL && 0 != strcmp(Ri18n_wctype_func[i].name, name) ; i++ ); return Ri18n_wctype_func[i].wctype; } int Ri18n_iswctype(wint_t wc, wctype_t desc) { int i; for (i = 0 ; Ri18n_wctype_func[i].wctype != 0 && Ri18n_wctype_func[i].wctype != desc ; i++ ); return (*Ri18n_wctype_func[i].func)(wc); } #endif // ------------------------- tolower/upper functions -------------------- #ifdef USE_RI18N_CASE /* These tables were prepared by the R code tab <- read.table('UnicodeData.txt', sep = ';', header = FALSE) tab <- tab[, c("V1", "V13", "V14")] names(tab) <- c('pt', 'uc', 'lc') toupper <- tab[tab$uc !="", 1:2] tolower <- tab[tab$lc !="", c(1,3)] cat(with(toupper, sprintf(" { 0x%s, 0x%s },", pt, uc)), sep = "\n", file = "rlocale_toupper.h") cat(with(tolower, sprintf(" { 0x%s, 0x%s },", pt, lc)), sep = "\n", file = "rlocale_tolower.h") from https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt */ struct pair {int from; int to;}; static const struct pair table_toupper[] = { #include "rlocale_toupper.h" }; static const struct pair table_tolower[] = { #include "rlocale_tolower.h" }; static int tlsearch(int wint, const struct pair *table, int max) { int min = 0, mid; max--; if (wint < table[0].from || wint > table[max].from) return -1; while (max >= min) { mid = (min + max) / 2; if (wint > table[mid].from) min = mid + 1; else if (wint < table[mid].from) max = mid - 1; else return table[mid].to; } return -1; } R_wchar_t Ri18n_towupper(R_wchar_t wc) { int res = tlsearch(wc, table_toupper, sizeof(table_toupper)/sizeof(struct pair)); return (res >= 0 ? res : wc); } R_wchar_t Ri18n_towlower(R_wchar_t wc) { int res = tlsearch(wc, table_tolower, sizeof(table_tolower)/sizeof(struct pair)); return (res >= 0 ? res : wc); } // ----------------- end of tolower/upper functions -------------------- #endif