EVOLUTION-MANAGER
Edit File: cpl_recode.cpp
/********************************************************************** * $Id: cpl_recode.cpp 27044 2014-03-16 23:41:27Z rouault $ * * Name: cpl_recode.cpp * Project: CPL - Common Portability Library * Purpose: Character set recoding and char/wchar_t conversions. * Author: Andrey Kiselev, dron@ak4719.spb.edu * ********************************************************************** * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu> * Copyright (c) 2008, Frank Warmerdam * Copyright (c) 2011-2014, Even Rouault <even dot rouault at mines-paris dot org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. **********************************************************************/ #include "cpl_string.h" CPL_CVSID("$Id: cpl_recode.cpp 27044 2014-03-16 23:41:27Z rouault $"); #ifdef CPL_RECODE_ICONV extern void CPLClearRecodeIconvWarningFlags(); extern char *CPLRecodeIconv( const char *, const char *, const char * ); extern char *CPLRecodeFromWCharIconv( const wchar_t *, const char *, const char * ); extern wchar_t *CPLRecodeToWCharIconv( const char *, const char *, const char * ); #endif /* CPL_RECODE_ICONV */ extern void CPLClearRecodeStubWarningFlags(); extern char *CPLRecodeStub( const char *, const char *, const char * ); extern char *CPLRecodeFromWCharStub( const wchar_t *, const char *, const char * ); extern wchar_t *CPLRecodeToWCharStub( const char *, const char *, const char * ); extern int CPLIsUTF8Stub( const char *, int ); /************************************************************************/ /* CPLRecode() */ /************************************************************************/ /** * Convert a string from a source encoding to a destination encoding. * * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported : * <ul> * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in fact)</li> * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li> * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li> * </ul> * * If an error occurs an error may, or may not be posted with CPLError(). * * @param pszSource a NULL terminated string. * @param pszSrcEncoding the source encoding. * @param pszDstEncoding the destination encoding. * * @return a NULL terminated string which should be freed with CPLFree(). * * @since GDAL 1.6.0 */ char CPL_DLL *CPLRecode( const char *pszSource, const char *pszSrcEncoding, const char *pszDstEncoding ) { /* -------------------------------------------------------------------- */ /* Handle a few common short cuts. */ /* -------------------------------------------------------------------- */ if ( EQUAL(pszSrcEncoding, pszDstEncoding) ) return CPLStrdup(pszSource); if ( EQUAL(pszSrcEncoding, CPL_ENC_ASCII) && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8) || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) ) return CPLStrdup(pszSource); #ifdef CPL_RECODE_ICONV /* -------------------------------------------------------------------- */ /* CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8 */ /* and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are hadled */ /* very well by the stub implementation which is faster than the */ /* iconv() route. Use a stub for these two ones and iconv() */ /* everything else. */ /* -------------------------------------------------------------------- */ if ( ( EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) && EQUAL(pszDstEncoding, CPL_ENC_UTF8) ) || ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8) && EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) ) { return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding ); } else { return CPLRecodeIconv( pszSource, pszSrcEncoding, pszDstEncoding ); } #else /* CPL_RECODE_STUB */ return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding ); #endif /* CPL_RECODE_ICONV */ } /************************************************************************/ /* CPLRecodeFromWChar() */ /************************************************************************/ /** * Convert wchar_t string to UTF-8. * * Convert a wchar_t string into a multibyte utf-8 string. The only * guaranteed supported source encoding is CPL_ENC_UCS2, and the only * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII * and CPL_ENC_ISO8859_1. In some cases (ie. using iconv()) other encodings * may also be supported. * * Note that the wchar_t type varies in size on different systems. On * win32 it is normally 2 bytes, and on unix 4 bytes. * * If an error occurs an error may, or may not be posted with CPLError(). * * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t. * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2. * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8. * * @return a zero terminated multi-byte string which should be freed with * CPLFree(), or NULL if an error occurs. * * @since GDAL 1.6.0 */ char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource, const char *pszSrcEncoding, const char *pszDstEncoding ) { #ifdef CPL_RECODE_ICONV /* -------------------------------------------------------------------- */ /* Conversions from CPL_ENC_UCS2 */ /* to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */ /* handled by the stub implementation. */ /* -------------------------------------------------------------------- */ if ( (EQUAL(pszSrcEncoding, CPL_ENC_UCS2) || EQUAL(pszSrcEncoding, "WCHAR_T")) && ( EQUAL(pszDstEncoding, CPL_ENC_UTF8) || EQUAL(pszDstEncoding, CPL_ENC_ASCII) || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1) ) ) { return CPLRecodeFromWCharStub( pwszSource, pszSrcEncoding, pszDstEncoding ); } else { return CPLRecodeFromWCharIconv( pwszSource, pszSrcEncoding, pszDstEncoding ); } #else /* CPL_RECODE_STUB */ return CPLRecodeFromWCharStub( pwszSource, pszSrcEncoding, pszDstEncoding ); #endif /* CPL_RECODE_ICONV */ } /************************************************************************/ /* CPLRecodeToWChar() */ /************************************************************************/ /** * Convert UTF-8 string to a wchar_t string. * * Convert a 8bit, multi-byte per character input string into a wide * character (wchar_t) string. The only guaranteed supported source encodings * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source * and destination encodings may be supported depending on the underlying * implementation. * * Note that the wchar_t type varies in size on different systems. On * win32 it is normally 2 bytes, and on unix 4 bytes. * * If an error occurs an error may, or may not be posted with CPLError(). * * @param pszSource input multi-byte character string. * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8. * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2. * * @return the zero terminated wchar_t string (to be freed with CPLFree()) or * NULL on error. * * @since GDAL 1.6.0 */ wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource, const char *pszSrcEncoding, const char *pszDstEncoding ) { #ifdef CPL_RECODE_ICONV /* -------------------------------------------------------------------- */ /* Conversions to CPL_ENC_UCS2 */ /* from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */ /* handled by the stub implementation. */ /* -------------------------------------------------------------------- */ if ( (EQUAL(pszDstEncoding, CPL_ENC_UCS2) || EQUAL(pszDstEncoding, "WCHAR_T")) && ( EQUAL(pszSrcEncoding, CPL_ENC_UTF8) || EQUAL(pszSrcEncoding, CPL_ENC_ASCII) || EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) ) ) { return CPLRecodeToWCharStub( pszSource, pszSrcEncoding, pszDstEncoding ); } else { return CPLRecodeToWCharIconv( pszSource, pszSrcEncoding, pszDstEncoding ); } #else /* CPL_RECODE_STUB */ return CPLRecodeToWCharStub( pszSource, pszSrcEncoding, pszDstEncoding ); #endif /* CPL_RECODE_ICONV */ } /************************************************************************/ /* CPLIsUTF8() */ /************************************************************************/ /** * Test if a string is encoded as UTF-8. * * @param pabyData input string to test * @param nLen length of the input string, or -1 if the function must compute * the string length. In which case it must be null terminated. * @return TRUE if the string is encoded as UTF-8. FALSE otherwise * * @since GDAL 1.7.0 */ int CPLIsUTF8(const char* pabyData, int nLen) { return CPLIsUTF8Stub( pabyData, nLen ); } /************************************************************************/ /* CPLForceToASCII() */ /************************************************************************/ /** * Return a new string that is made only of ASCII characters. If non-ASCII * characters are found in the input string, they will be replaced by the * provided replacement character. * * @param pabyData input string to test * @param nLen length of the input string, or -1 if the function must compute * the string length. In which case it must be null terminated. * @param chReplacementChar character which will be used when the input stream * contains a non ASCII character. Must be valid ASCII ! * * @return a new string that must be freed with CPLFree(). * * @since GDAL 1.7.0 */ char CPL_DLL *CPLForceToASCII(const char* pabyData, int nLen, char chReplacementChar) { if (nLen < 0) nLen = strlen(pabyData); char* pszOutputString = (char*)CPLMalloc(nLen + 1); int i; for(i=0;i<nLen;i++) { if (((unsigned char*)pabyData)[i] > 127) pszOutputString[i] = chReplacementChar; else pszOutputString[i] = pabyData[i]; } pszOutputString[i] = '\0'; return pszOutputString; } /************************************************************************/ /* CPLEncodingCharSize() */ /************************************************************************/ /** * Return bytes per character for encoding. * * This function returns the size in bytes of the smallest character * in this encoding. For fixed width encodings (ASCII, UCS-2, UCS-4) this * is straight forward. For encodings like UTF8 and UTF16 which represent * some characters as a sequence of atomic character sizes the function * still returns the atomic character size (1 for UTF8, 2 for UTF16). * * This function will return the correct value for well known encodings * with corresponding CPL_ENC_ values. It may not return the correct value * for other encodings even if they are supported by the underlying iconv * or windows transliteration services. Hopefully it will improve over time. * * @param pszEncoding the name of the encoding. * * @return the size of a minimal character in bytes or -1 if the size is * unknown. */ int CPLEncodingCharSize( const char *pszEncoding ) { if( EQUAL(pszEncoding,CPL_ENC_UTF8) ) return 1; else if( EQUAL(pszEncoding,CPL_ENC_UTF16) ) return 2; else if( EQUAL(pszEncoding,CPL_ENC_UCS2) ) return 2; else if( EQUAL(pszEncoding,CPL_ENC_UCS4) ) return 4; else if( EQUAL(pszEncoding,CPL_ENC_ASCII) ) return 1; else if( EQUALN(pszEncoding,"ISO-8859-",9) ) return 1; else return -1; } /************************************************************************/ /* CPLClearRecodeWarningFlags() */ /************************************************************************/ void CPLClearRecodeWarningFlags() { #ifdef CPL_RECODE_ICONV CPLClearRecodeIconvWarningFlags(); #endif CPLClearRecodeStubWarningFlags(); } /************************************************************************/ /* CPLStrlenUTF8() */ /************************************************************************/ /** * Return the number of UTF-8 characters of a nul-terminated string. * * This is different from strlen() which returns the number of bytes. * * @param pszUTF8Str a nul-terminated UTF-8 string * * @return the number of UTF-8 characters. */ int CPLStrlenUTF8(const char *pszUTF8Str) { int i = 0, j = 0; while (pszUTF8Str[i]) { if ((pszUTF8Str[i] & 0xc0) != 0x80) j++; i++; } return j; }