EVOLUTION-MANAGER
Edit File: avc_mbyte.c
/* $Id: avc_mbyte.c,v 1.4 2008/07/23 20:51:38 dmorissette Exp $ * * Name: avc_mbyte.c * Project: Arc/Info vector coverage (AVC) E00->BIN conversion library * Language: ANSI C * Purpose: Functions to handle multibyte character conversions. * Author: Daniel Morissette, dmorissette@dmsolutions.ca * ********************************************************************** * Copyright (c) 1999-2005, Daniel Morissette * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
********************************************************************** * * $Log: avc_mbyte.c,v $ * Revision 1.4 2008/07/23 20:51:38 dmorissette * Fixed GCC 4.1.x compile warnings related to use of char vs unsigned char * (GDAL/OGR ticket http://trac.osgeo.org/gdal/ticket/2495) * * Revision 1.3 2005/06/03 03:49:59 daniel * Update email address, website url, and copyright dates * * Revision 1.2 2000/09/22 19:45:21 daniel * Switch to MIT-style license * * Revision 1.1 2000/05/29 15:31:03 daniel * Initial revision - Japanese support * **********************************************************************/ #include "avc.h" #ifdef _WIN32 # include <mbctype.h> #endif static int _AVCDetectJapaneseEncoding(const GByte *pszLine); static const GByte *_AVCJapanese2ArcDBCS(AVCDBCSInfo *psDBCSInfo, const GByte *pszLine, int nMaxOutputLen); static const GByte *_AVCArcDBCS2JapaneseShiftJIS(AVCDBCSInfo *psDBCSInfo, const GByte *pszLine, int nMaxOutputLen); /*===================================================================== * Functions to handle multibyte char conversions *====================================================================*/ #define IS_ASCII(c) ((c) < 0x80) /********************************************************************** * AVCAllocDBCSInfo() * * Alloc and init a new AVCDBCSInfo structure. **********************************************************************/ AVCDBCSInfo *AVCAllocDBCSInfo() { AVCDBCSInfo *psInfo; psInfo = (AVCDBCSInfo*)CPLCalloc(1, sizeof(AVCDBCSInfo)); psInfo->nDBCSCodePage = AVCGetDBCSCodePage(); psInfo->nDBCSEncoding = AVC_CODE_UNKNOWN; psInfo->pszDBCSBuf = NULL; psInfo->nDBCSBufSize = 0; return psInfo; } /********************************************************************** * AVCFreeDBCSInfo() * * Release all memory associated with a AVCDBCSInfo structure. 
**********************************************************************/ void AVCFreeDBCSInfo(AVCDBCSInfo *psInfo) { if (psInfo) { CPLFree(psInfo->pszDBCSBuf); CPLFree(psInfo); } } /********************************************************************** * AVCGetDBCSCodePage() * * Fetch current multibyte codepage on the system. * Returns a valid codepage number, or 0 if the codepage is single byte or * unsupported. **********************************************************************/ int AVCGetDBCSCodePage() { #ifdef _WIN32 int nCP; nCP = _getmbcp(); /* Check if that's a supported codepage */ if (nCP == AVC_DBCS_JAPANESE) return nCP; #endif return 0; } /********************************************************************** * AVCE00DetectEncoding() * * Try to detect the encoding used in the current file by examining lines * of input. * * Returns TRUE once the encoding is established, or FALSE if more lines * of input are required to establish the encoding. **********************************************************************/ GBool AVCE00DetectEncoding(AVCDBCSInfo *psDBCSInfo, const GByte *pszLine) { if (psDBCSInfo == NULL || psDBCSInfo->nDBCSCodePage == 0 || psDBCSInfo->nDBCSEncoding != AVC_CODE_UNKNOWN) { /* Either single byte codepage, or encoding has already been detected */ return TRUE; } switch (psDBCSInfo->nDBCSCodePage) { case AVC_DBCS_JAPANESE: psDBCSInfo->nDBCSEncoding = _AVCDetectJapaneseEncoding(pszLine); break; default: psDBCSInfo->nDBCSEncoding = AVC_CODE_UNKNOWN; return TRUE; /* Codepage not supported... no need to scan more lines*/ } if (psDBCSInfo->nDBCSEncoding != AVC_CODE_UNKNOWN) return TRUE; /* We detected the encoding! */ return FALSE; } /********************************************************************** * AVCE00Convert2ArcDBCS() * * If encoding is still unknown, try to detect the encoding used in the * current file, and then convert the string to an encoding validfor output * to a coverage. 
*
 * Returns a reference to a const buffer that should not be freed by the
 * caller.  It can be either the original string buffer or a ref. to an
 * internal buffer.
 **********************************************************************/
const GByte *AVCE00Convert2ArcDBCS(AVCDBCSInfo *psDBCSInfo,
                                   const GByte *pszLine,
                                   int nMaxOutputLen)
{
    const GByte *pabyScan;
    int nWantedSize;

    /* Single byte codepage (or nothing to work on)... nothing to do */
    if (psDBCSInfo == NULL || psDBCSInfo->nDBCSCodePage == 0 ||
        pszLine == NULL)
    {
        return pszLine;
    }

    /* Skip the leading run of ASCII bytes.  Reaching the terminator means
     * the whole string is ASCII and can be returned untouched.
     */
    pabyScan = pszLine;
    while (*pabyScan != '\0' && IS_ASCII(*pabyScan))
        pabyScan++;
    if (*pabyScan == '\0')
        return pszLine;

    /* Make sure the internal buffer is large enough.
     * Two extra chars are reserved so that the converters do not have to
     * check whether the second byte of a pair would overflow the buffer.
     */
    nWantedSize = nMaxOutputLen + 2;
    if (psDBCSInfo->pszDBCSBuf == NULL ||
        psDBCSInfo->nDBCSBufSize < nWantedSize)
    {
        psDBCSInfo->nDBCSBufSize = nWantedSize;
        psDBCSInfo->pszDBCSBuf =
            (GByte *)CPLRealloc(psDBCSInfo->pszDBCSBuf,
                                psDBCSInfo->nDBCSBufSize * sizeof(GByte));
    }

    /* Do the conversion according to current code page */
    if (psDBCSInfo->nDBCSCodePage == AVC_DBCS_JAPANESE)
        return _AVCJapanese2ArcDBCS(psDBCSInfo, pszLine, nMaxOutputLen);

    /* We should never get here anyways, but just in case return pszLine */
    CPLAssert( !"SHOULD NEVER GET HERE" );
    return pszLine;
}

/**********************************************************************
 *                          AVCE00ConvertFromArcDBCS()
 *
 * Convert DBCS encoding in binary coverage files to E00 encoding.
 *
 * Returns a reference to a const buffer that should not be freed by the
 * caller.  It can be either the original string buffer or a ref. to an
 * internal buffer.
**********************************************************************/
const GByte *AVCE00ConvertFromArcDBCS(AVCDBCSInfo *psDBCSInfo,
                                      const GByte *pszLine,
                                      int nMaxOutputLen)
{
    const GByte *pabyCursor;
    GBool bFoundNonAscii = FALSE;

    /* Single byte codepage (or nothing to work on)... nothing to do */
    if (psDBCSInfo == NULL || psDBCSInfo->nDBCSCodePage == 0 ||
        pszLine == NULL)
    {
        return pszLine;
    }

    /* A string made only of ASCII bytes needs no conversion: look for
     * the first byte that is not ASCII and stop there.
     */
    for (pabyCursor = pszLine; *pabyCursor != '\0'; pabyCursor++)
    {
        if (!IS_ASCII(*pabyCursor))
        {
            bFoundNonAscii = TRUE;
            break;
        }
    }
    if (!bFoundNonAscii)
        return pszLine;

    /* Make sure the internal buffer is large enough.
     * Two extra chars are reserved so that the converters do not have to
     * check whether the second byte of a pair would overflow the buffer.
     */
    if (psDBCSInfo->pszDBCSBuf == NULL ||
        psDBCSInfo->nDBCSBufSize < nMaxOutputLen + 2)
    {
        psDBCSInfo->nDBCSBufSize = nMaxOutputLen + 2;
        psDBCSInfo->pszDBCSBuf =
            (GByte *)CPLRealloc(psDBCSInfo->pszDBCSBuf,
                                psDBCSInfo->nDBCSBufSize * sizeof(GByte));
    }

    /* Do the conversion according to current code page */
    if (psDBCSInfo->nDBCSCodePage == AVC_DBCS_JAPANESE)
    {
        return _AVCArcDBCS2JapaneseShiftJIS(psDBCSInfo, pszLine,
                                            nMaxOutputLen);
    }

    /* We should never get here anyways, but just in case return pszLine */
    return pszLine;
}

/*=====================================================================
 *=====================================================================
 * Functions Specific to Japanese encoding (CodePage 932).
 *
 * For now we assume that we can receive only Katakana, Shift-JIS, or EUC
 * encoding as input.  Coverages use EUC encoding in most cases, except
 * for Katakana characters that are prefixed with a 0x8e byte.
* * Most of the Japanese conversion functions are based on information and * algorithms found at: * http://www.mars.dti.ne.jp/~torao/program/appendix/japanese-en.html *===================================================================== *====================================================================*/ /********************************************************************** * _AVCDetectJapaneseEncoding() * * Scan a line of text to try to establish the type of japanese encoding * * Returns the encoding number (AVC_CODE_JAP_*), or AVC_CODE_UNKNOWN if no * specific encoding was detected. **********************************************************************/ #define IS_JAP_SHIFTJIS_1(c) ((c) >= 0x81 && (c) <= 0x9f) #define IS_JAP_SHIFTJIS_2(c) (((c) >= 0x40 && (c) <= 0x7e) || \ ((c) >= 0x80 && (c) <= 0xA0) ) #define IS_JAP_EUC_1(c) ((c) >= 0xF0 && (c) <= 0xFE) #define IS_JAP_EUC_2(c) ((c) >= 0xFD && (c) <= 0xFE) #define IS_JAP_KANA(c) ((c) >= 0xA1 && (c) <= 0xDF) static int _AVCDetectJapaneseEncoding(const GByte *pszLine) { int nEncoding = AVC_CODE_UNKNOWN; for( ; nEncoding == AVC_CODE_UNKNOWN && pszLine && *pszLine; pszLine++) { if (IS_ASCII(*pszLine)) continue; else if (IS_JAP_SHIFTJIS_1(*pszLine)) { nEncoding = AVC_CODE_JAP_SHIFTJIS; break; } else if (IS_JAP_KANA(*pszLine) && *(pszLine+1) && (IS_ASCII(*(pszLine+1)) || (*(pszLine+1)>=0x80 && *(pszLine+1)<=0xA0) ) ) { nEncoding = AVC_CODE_JAP_SHIFTJIS; /* SHIFT-JIS + Kana */ break; } else if (IS_JAP_EUC_1(*pszLine)) { nEncoding = AVC_CODE_JAP_EUC; break; } if (*(++pszLine) == '\0') break; if (IS_JAP_SHIFTJIS_2(*pszLine)) { nEncoding = AVC_CODE_JAP_SHIFTJIS; break; } else if (IS_JAP_EUC_2(*pszLine)) { nEncoding = AVC_CODE_JAP_EUC; break; } } return nEncoding; } /********************************************************************** * _AVCJapanese2ArcDBCS() * * Try to detect type of Japanese encoding if not done yet, and convert * string from Japanese to proper coverage DBCS encoding. 
**********************************************************************/
static const GByte *_AVCJapanese2ArcDBCS(AVCDBCSInfo *psDBCSInfo,
                                         const GByte *pszLine,
                                         int nMaxOutputLen)
{
    /* Output goes to the buffer owned by psDBCSInfo. */
    GByte *pabyDst = psDBCSInfo->pszDBCSBuf;
    const GByte *pabySrc = pszLine;
    int nWritten = 0;

    if (psDBCSInfo->nDBCSEncoding == AVC_CODE_UNKNOWN)
    {
        /* Type of encoding (Shift-JIS or EUC) not known yet... try to
         * detect it now.
         */
        psDBCSInfo->nDBCSEncoding = _AVCDetectJapaneseEncoding(pszLine);
    }

    /* The limit is only tested before each input character, so a two-byte
     * sequence started at nMaxOutputLen-1 is still written whole.
     */
    while (*pabySrc != '\0' && nWritten < nMaxOutputLen)
    {
        const GByte byFirst = *pabySrc;

        if (IS_ASCII(byFirst))
        {
            /* No transformation required for ASCII */
            pabyDst[nWritten++] = byFirst;
            pabySrc++;
        }
        else if (psDBCSInfo->nDBCSEncoding == AVC_CODE_JAP_EUC &&
                 pabySrc[1] != '\0')
        {
            /* This must be a pair of EUC chars and both should be in
             * the range 0xA1-0xFE: copy them through unchanged.
             */
            pabyDst[nWritten++] = byFirst;
            pabyDst[nWritten++] = pabySrc[1];
            pabySrc += 2;
        }
        else if (IS_JAP_KANA(byFirst))
        {
            /* Katakana char. prefix it with 0x8e */
            pabyDst[nWritten++] = 0x8e;
            pabyDst[nWritten++] = byFirst;
            pabySrc++;
        }
        else if (pabySrc[1] != '\0')
        {
            /* This must be a pair of Shift-JIS chars... convert them to EUC
             *
             * If we haven't been able to establish the encoding for sure
             * yet, then it is possible that a pair of EUC chars could be
             * treated as shift-JIS here... but there is not much we can do
             * about that unless we scan the whole E00 input before we
             * start the conversion.
             */
            unsigned char chLead = byFirst;
            unsigned char chTrail = pabySrc[1];

            chLead -= (chLead <= 0x9F) ? 0x71 : 0xB1;
            chLead = (chLead << 1) + 1;

            if (chTrail > 0x7F)
                chTrail--;

            if (chTrail >= 0x9E)
            {
                chTrail -= 0x7D;
                chLead++;
            }
            else
            {
                chTrail -= 0x1F;
            }

            pabyDst[nWritten++] = chLead | 0x80;
            pabyDst[nWritten++] = chTrail | 0x80;
            pabySrc += 2;
        }
        else
        {
            /* We should never get here unless a double-byte pair was
             * truncated... but just in case...
             */
            pabyDst[nWritten++] = byFirst;
            pabySrc++;
        }
    }

    pabyDst[nWritten] = '\0';

    return psDBCSInfo->pszDBCSBuf;
}

/**********************************************************************
 *                          _AVCArcDBCS2JapaneseShiftJIS()
 *
 * Convert string from coverage DBCS (EUC) to Japanese Shift-JIS.
 *
 * We know that binary coverages use a custom EUC encoding for Japanese
 * which is EUC + all Katakana chars are prefixed with 0x8e.  So this
 * function just does a simple conversion.
 *
 * The result is written to psDBCSInfo->pszDBCSBuf, which is also the
 * returned pointer.
 **********************************************************************/
static const GByte *_AVCArcDBCS2JapaneseShiftJIS(AVCDBCSInfo *psDBCSInfo,
                                                 const GByte *pszLine,
                                                 int nMaxOutputLen)
{
    GByte *pabyDst = psDBCSInfo->pszDBCSBuf;
    const GByte *pabySrc = pszLine;
    int nWritten = 0;

    /* The limit is only tested before each input character, so a pair
     * started at nMaxOutputLen-1 is still written whole.
     */
    while (*pabySrc != '\0' && nWritten < nMaxOutputLen)
    {
        const GByte byFirst = *pabySrc;

        if (IS_ASCII(byFirst))
        {
            /* No transformation required for ASCII */
            pabyDst[nWritten++] = byFirst;
            pabySrc++;
        }
        else if (byFirst == 0x8e && pabySrc[1] != '\0')
        {
            /* Flush the 0x8e and keep only the byte that follows it */
            pabyDst[nWritten++] = pabySrc[1];
            pabySrc += 2;
        }
        else if (pabySrc[1] != '\0')
        {
            /* This is a pair of EUC chars... convert them to Shift-JIS */
            unsigned char chLead = byFirst & 0x7F;
            unsigned char chTrail = pabySrc[1] & 0x7F;

            chTrail += ((chLead & 0x01) != 0) ? 0x1F : 0x7D;
            if (chTrail >= 0x7F)
                chTrail++;

            chLead = ((chLead - 0x21) >> 1) + 0x81;
            if (chLead > 0x9F)
                chLead += 0x40;

            pabyDst[nWritten++] = chLead;
            pabyDst[nWritten++] = chTrail;
            pabySrc += 2;
        }
        else
        {
            /* We should never get here unless a double-byte pair was
             * truncated... but just in case...
             */
            pabyDst[nWritten++] = byFirst;
            pabySrc++;
        }
    }

    pabyDst[nWritten] = '\0';

    return psDBCSInfo->pszDBCSBuf;
}