/*
Copyright 2001-2018 John Wiseman G8BPQ

This file is part of LinBPQ/BPQ32.

LinBPQ/BPQ32 is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

LinBPQ/BPQ32 is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with LinBPQ/BPQ32.  If not, see http://www.gnu.org/licenses
*/

// Routines to convert to and from UTF8

#define WIN32_LEAN_AND_MEAN		// Exclude rarely-used stuff from Windows headers
#define _CRT_SECURE_NO_DEPRECATE

#ifndef WIN32

#define VOID void
#define BOOL int
#define TRUE 1
#define FALSE 0

#include <string.h>

#else
#include <windows.h>
#endif

#include <stdlib.h>				// malloc/free, used by checkUTF8 for large inputs

// Conversion mode read by checkUTF8():
//	-1		pass input through unchanged
//	 0		guess the code page with TrytoGuessCode()
//	437		treat input as CP437
//	1251	treat input as CP1251
//	any other value is treated as CP1252

int convUTF8 = 0;

// Lookup tables for the upper half (0x80 - 0xFF) of each code page.
//
// Entry [c - 128] of a ...Data table holds the UTF-8 encoding of byte c, packed
// into an unsigned int so that on a little-endian host the first
// ...DataLen[c - 128] bytes in memory are the UTF-8 bytes in output order.
// The converters copy those bytes straight out with memcpy(); on big-endian
// hosts initUTF8() must reverse each entry first.

unsigned int CP437toUTF8Data[128] = {
	/* 0x80 */
	34755, 48323, 43459, 41667, 42179, 41155, 42435, 42947,
	43715, 43971, 43203, 44995, 44739, 44227, 33987, 34243,
	/* 0x90 */
	35267, 42691, 34499, 46275, 46787, 45763, 48067, 47555,
	49091, 38595, 40131, 41666, 41922, 42434, 10978018, 37574,
	/* 0xA0 */
	41411, 44483, 46019, 47811, 45507, 37315, 43714, 47810,
	49090, 9473250, 44226, 48578, 48322, 41410, 43970, 48066,
	/* 0xB0 */
	9541346, 9606882, 9672418, 8557794, 10786018, 10589666, 10655202, 9868770,
	9803234, 10720738, 9541090, 9934306, 10327522, 10261986, 10196450, 9475298,
	/* 0xC0 */
	9737442, 11834594, 11310306, 10261730, 8426722, 12358882, 10393058, 10458594,
	10130914, 9737698, 11113954, 10917346, 10524130, 9475554, 11310562, 10982882,
	/* 0xD0 */
	11048418, 10786274, 10851810, 10065378, 9999842, 9606626, 9672162, 11245026,
	11179490, 9999586, 9213154, 8951522, 8689378, 9213666, 9475810, 8427234,
	/* 0xE0 */
	45518, 40899, 37838, 32975, 41934, 33743, 46530, 33999,
	42702, 39118, 43470, 46286, 10389730, 34511, 46542, 11110626,
	/* 0xF0 */
	10586594, 45506, 10848738, 10783202, 10521826, 10587362, 47043, 8948194,
	45250, 10062050, 47042, 10127586, 12550626, 45762, 10524386, 41154,
};

unsigned int CP437toUTF8DataLen[128] = {
	/* 0x80 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2,
	/* 0xA0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2,
	/* 0xB0 */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xC0 */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xD0 */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xE0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3,
	/* 0xF0 */
	3, 2, 3, 3, 3, 3, 2, 3, 2, 3, 2, 3, 3, 2, 3, 2,
};

unsigned int CP1251toUTF8Data[128] = {
	/* 0x80 */
	33488, 33744, 10125538, 37841, 10387682, 10911970, 10518754, 10584290,
	11305698, 11567330, 35280, 12157154, 35536, 36048, 35792, 36816,
	/* 0x90 */
	37585, 9994466, 10060002, 10256610, 10322146, 10649826, 9666786, 9732322,
	39106, 10650850, 39377, 12222690, 39633, 40145, 39889, 40913,
	/* 0xA0 */
	41154, 36560, 40657, 35024, 42178, 37074, 42690, 42946,
	33232, 43458, 34000, 43970, 44226, 44482, 44738, 34768,
	/* 0xB0 */
	45250, 45506, 34512, 38609, 37330, 46530, 46786, 47042,
	37329, 9864418, 38097, 48066, 39121, 34256, 38353, 38865,
	/* 0xC0 */
	37072, 37328, 37584, 37840, 38096, 38352, 38608, 38864,
	39120, 39376, 39632, 39888, 40144, 40400, 40656, 40912,
	/* 0xD0 */
	41168, 41424, 41680, 41936, 42192, 42448, 42704, 42960,
	43216, 43472, 43728, 43984, 44240, 44496, 44752, 45008,
	/* 0xE0 */
	45264, 45520, 45776, 46032, 46288, 46544, 46800, 47056,
	47312, 47568, 47824, 48080, 48336, 48592, 48848, 49104,
	/* 0xF0 */
	32977, 33233, 33489, 33745, 34001, 34257, 34513, 34769,
	35025, 35281, 35537, 35793, 36049, 36305, 36561, 36817,
};

unsigned int CP1251toUTF8DataLen[128] = {
	/* 0x80 */
	2, 2, 3, 2, 3, 3, 3, 3, 3, 3, 2, 3, 2, 2, 2, 2,
	/* 0x90 */
	2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 2, 2, 2, 2,
	/* 0xA0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xB0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2,
	/* 0xC0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
};

unsigned int CP1252toUTF8Data[128] = {
	/* 0x80 */
	11305698, 33218, 10125538, 37574, 10387682, 10911970, 10518754, 10584290,
	34507, 11567330, 41157, 12157154, 37573, 36290, 48581, 36802,
	/* 0x90 */
	37058, 9994466, 10060002, 10256610, 10322146, 10649826, 9666786, 9732322,
	40139, 10650850, 41413, 12222690, 37829, 40386, 48837, 47301,
	/* 0xA0 */
	41154, 41410, 41666, 41922, 42178, 42434, 42690, 42946,
	43202, 43458, 43714, 43970, 44226, 44482, 44738, 44994,
	/* 0xB0 */
	45250, 45506, 45762, 46018, 46274, 46530, 46786, 47042,
	47298, 47554, 47810, 48066, 48322, 48578, 48834, 49090,
	/* 0xC0 */
	32963, 33219, 33475, 33731, 33987, 34243, 34499, 34755,
	35011, 35267, 35523, 35779, 36035, 36291, 36547, 36803,
	/* 0xD0 */
	37059, 37315, 37571, 37827, 38083, 38339, 38595, 38851,
	39107, 39363, 39619, 39875, 40131, 40387, 40643, 40899,
	/* 0xE0 */
	41155, 41411, 41667, 41923, 42179, 42435, 42691, 42947,
	43203, 43459, 43715, 43971, 44227, 44483, 44739, 44995,
	/* 0xF0 */
	45251, 45507, 45763, 46019, 46275, 46531, 46787, 47043,
	47299, 47555, 47811, 48067, 48323, 48579, 48835, 49091,
};

unsigned int CP1252toUTF8DataLen[128] = {
	/* 0x80 */
	3, 2, 3, 2, 3, 3, 3, 3, 2, 3, 2, 3, 2, 2, 2, 2,
	/* 0x90 */
	2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 2, 2, 2, 2,
	/* 0xA0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xB0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xC0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
};

// On big-endian hosts the tables need a one-off byte reversal (see initUTF8),
// so the flag starts FALSE there. On other hosts the tables are usable as
// compiled and initUTF8() returns immediately.
//
// __BIG_ENDIAN__ is only predefined by some compilers; __BYTE_ORDER__ is also
// checked so that big-endian builds which provide only the latter are detected.

#if defined(__BIG_ENDIAN__) || \
	(defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))
BOOL initUTF8Done = FALSE;
#else
BOOL initUTF8Done = TRUE;
#endif

// Reverse the four bytes of each of the 128 entries of one conversion table.

static void reverseTableBytes(unsigned int * table)
{
	int n;
	unsigned char fwd[4];
	unsigned char rev[4];

	for (n = 0; n < 128; n++)
	{
		memcpy(fwd, &table[n], 4);

		rev[0] = fwd[3];
		rev[1] = fwd[2];
		rev[2] = fwd[1];
		rev[3] = fwd[0];

		memcpy(&table[n], rev, 4);
	}
}

// Swap bytes of the UTF-8 codes in all three tables on big-endian systems.
// Does nothing once initUTF8Done is TRUE, so calling it repeatedly is harmless.

VOID initUTF8(void)
{
	if (initUTF8Done)
		return;

	reverseTableBytes(CP437toUTF8Data);
	reverseTableBytes(CP1251toUTF8Data);
	reverseTableBytes(CP1252toUTF8Data);

	initUTF8Done = TRUE;
}

// Returns TRUE if any of the first len bytes at cpt has its top bit set,
// otherwise FALSE (including when len <= 0).

int Is8Bit(unsigned char *cpt, int len)
{
	int i;

	for (i = 0; i < len; i++)
	{
		if (cpt[i] & 0x80)
			return TRUE;
	}

	return FALSE;
}

// TRUE if c has the bit pattern 10xxxxxx of a UTF-8 continuation byte.

static int isUTF8Continuation(unsigned char c)
{
	return (c & 0xC0) == 0x80;
}

// Walk buf[start .. len-1] checking that every byte >= 0x80 is the lead byte of
// a 2, 3 or 4 byte sequence followed by the right number of continuation bytes.
//
// A sequence whose continuation bytes would lie at or beyond len is accepted
// when allowTruncatedTail is non-zero and rejected otherwise. Nothing at or
// beyond buf[len] is ever read or written.

static int scanUTF8(const unsigned char * buf, int start, int len, int allowTruncatedTail)
{
	int i = start;

	while (i < len)
	{
		unsigned char lead = buf[i];
		int extra;
		int k;

		if (lead < 128)
		{
			i++;
			continue;
		}

		if ((lead & 0xF8) == 0xF0)
			extra = 3;				// start of 4-byte sequence
		else if ((lead & 0xF0) == 0xE0)
			extra = 2;				// start of 3-byte sequence
		else if ((lead & 0xE0) == 0xC0)
			extra = 1;				// start of 2-byte sequence
		else
			return FALSE;			// stray continuation or invalid lead byte

		for (k = 1; k <= extra; k++)
		{
			if (i + k >= len)
			{
				if (allowTruncatedTail)
					break;			// rest of the sequence is in the next packet

				return FALSE;
			}

			if (!isUTF8Continuation(buf[i + k]))
				return FALSE;
		}

		i += extra + 1;
	}

	return TRUE;
}

// Loose UTF-8 check for data that may be a fragment of a longer stream.
//
// This has to be a bit loose, as UTF8 sequences may split over packets:
//	- up to three leading continuation bytes are skipped, as they could be the
//	  tail of a sequence that started in the previous packet;
//	- a sequence cut off by the end of the buffer is accepted.
//
// Returns TRUE if the data is acceptable, FALSE otherwise. Returns TRUE for
// len <= 0. The buffer is not modified and nothing beyond ptr[len - 1] is
// accessed (earlier versions padded ptr[len..len+2] with 0x80 to get the same
// result, which required the caller to provide three spare bytes).

int IsUTF8(unsigned char *ptr, int len)
{
	int start = 0;

	if (len <= 0)
		return TRUE;

	// Don't check first 3 if could be part of sequence

	while (start < 3 && (start >= len || isUTF8Continuation(ptr[start])))
		start++;

	return scanUTF8(ptr, start, len, TRUE);
}

// Strict UTF-8 check for complete lines of text.
//
// This is simpler than the Term version, as it only handles complete lines of
// text, so can't get split sequences: a leading continuation byte or a
// sequence that runs past len makes the result FALSE. Nothing beyond
// ptr[len - 1] is read.

int WebIsUTF8(unsigned char *ptr, int len)
{
	return scanUTF8(ptr, 0, len, FALSE);
}

// Shared body of the ConvertNNNNtoUTF8 routines.
//
// Copies len bytes from src to dst, passing bytes below 128 through unchanged
// and replacing each byte c >= 128 with the first dataLen[c - 128] bytes of
// data[c - 128]. Returns the number of bytes written to dst. With the tables
// in this file the output can be up to three times the input length; the
// caller must supply a large enough dst.

static int convertCodePageToUTF8(const unsigned char * src, int len, unsigned char * dst,
	const unsigned int * data, const unsigned int * dataLen)
{
	unsigned char * out = dst;
	int i;

	for (i = 0; i < len; i++)
	{
		unsigned int c = src[i];

		if (c < 128)
		{
			*(out++) = (unsigned char)c;
			continue;
		}

		memcpy(out, &data[c - 128], dataLen[c - 128]);
		out += dataLen[c - 128];
	}

	return (int)(out - dst);
}

// Convert len bytes of CP437 text at MsgPtr to UTF-8 in UTF.
// Returns the number of bytes written. UTF must not overlap MsgPtr.

int Convert437toUTF8(unsigned char * MsgPtr, int len, unsigned char * UTF)
{
	return convertCodePageToUTF8(MsgPtr, len, UTF, CP437toUTF8Data, CP437toUTF8DataLen);
}

// Convert len bytes of CP1251 text at MsgPtr to UTF-8 in UTF.
// Returns the number of bytes written. UTF must not overlap MsgPtr.

int Convert1251toUTF8(unsigned char * MsgPtr, int len, unsigned char * UTF)
{
	return convertCodePageToUTF8(MsgPtr, len, UTF, CP1251toUTF8Data, CP1251toUTF8DataLen);
}

// Convert len bytes of CP1252 text at MsgPtr to UTF-8 in UTF.
// Returns the number of bytes written. UTF must not overlap MsgPtr.

int Convert1252toUTF8(unsigned char * MsgPtr, int len, unsigned char * UTF)
{
	return convertCodePageToUTF8(MsgPtr, len, UTF, CP1252toUTF8Data, CP1252toUTF8DataLen);
}

// Guess which code page (437, 1251 or 1252) a block of 8-bit text uses.
//
// Counts three things over the Len bytes at Char:
//	- bytes below 65 (digits, spaces, punctuation, control characters),
//	- bytes above 127,
//	- bytes above 127 that are 0xF8 or in the range 179 - 223, which are
//	  treated as line-draw characters.
//
// Returns 1252 if there are no bytes above 127, 437 if every byte above 127
// counts as line draw, and otherwise 1251 when the bytes above 127 outnumber
// the remaining bytes in the range 65 - 127, else 1252.

int TrytoGuessCode(unsigned char * Char, int Len)
{
	int highCount = 0;
	int lineDrawCount = 0;
	int lowCount = 0;
	int i;

	for (i = 0; i < Len; i++)
	{
		unsigned char c = Char[i];

		if (c < 65)
		{
			lowCount++;
			continue;
		}

		if (c <= 127)
			continue;

		highCount++;

		if (c == 0xF8 || (c > 178 && c < 224))
			lineDrawCount++;
	}

	if (highCount == 0)				// Doesn't really matter!
		return 1252;

	if (highCount == lineDrawCount)
		return 437;					// If only Line Draw chars, assume line draw

	// If mainly below 128, it is probably Latin; if mainly above, probably Cyrillic

	if ((Len - (lowCount + highCount)) < highCount)
		return 1251;

	return 1252;
}

unsigned char outbuffer[16384];		// I don't think this needs to be thread safe

// Copy len bytes from in to out, converting to UTF-8 when needed.
//
// If convUTF8 is -1, or the input has no bytes above 127, the input is copied
// unchanged and len is returned. Otherwise the input is converted from the
// code page selected by convUTF8 (guessed by TrytoGuessCode() when convUTF8
// is 0) and the number of bytes written to out is returned.
//
// The input is copied before conversion, so in is never modified and out may
// overlap in. Inputs of up to 8192 bytes use a stack buffer; longer inputs use
// a temporary heap buffer, and if that allocation fails the input is copied to
// out unconverted. Returns 0 for len <= 0.
//
// out must have room for the converted text, which can be up to three times
// len; its size is not known here and is not checked.

int checkUTF8(unsigned char * in, int len, unsigned char * out)
{
	// We mustn't mess with input string

	unsigned char Msg[8192];
	unsigned char * copy = Msg;
	int u, code = convUTF8;

	if (len <= 0)
		return 0;

	if (convUTF8 == -1 || !Is8Bit(in, len))
	{
		// just copy to output (memmove because in and out may be the same buffer)

		memmove(out, in, (size_t)len);
		return len;
	}

	// Convert

	if ((size_t)len > sizeof(Msg))
	{
		// Too big for the stack buffer; copying regardless would overrun it

		copy = malloc((size_t)len);

		if (copy == NULL)
		{
			memmove(out, in, (size_t)len);
			return len;
		}
	}

	memcpy(copy, in, (size_t)len);

	if (convUTF8 == 0)				// Auto - Try to guess encoding
		code = TrytoGuessCode(copy, len);

	if (code == 437)
		u = Convert437toUTF8(copy, len, out);
	else if (code == 1251)
		u = Convert1251toUTF8(copy, len, out);
	else
		u = Convert1252toUTF8(copy, len, out);

	if (copy != Msg)
		free(copy);

	return u;
}