qttermtcp/utf8Routines.cpp

598 lines
12 KiB
C++

/*
Copyright 2001-2018 John Wiseman G8BPQ
This file is part of LinBPQ/BPQ32.
LinBPQ/BPQ32 is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
LinBPQ/BPQ32 is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with LinBPQ/BPQ32. If not, see http://www.gnu.org/licenses
*/
// Routines to convert to and from UTF8
#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
#define _CRT_SECURE_NO_DEPRECATE
#ifndef WIN32
#define VOID void
#define BOOL int
#define TRUE 1
#define FALSE 0
#include <string.h>
#else
#include <windows.h>
#endif
// Conversion mode read by checkUTF8():
//   -1          copy input through unchanged
//    0          auto - guess the source code page with TrytoGuessCode()
//    437, 1251  convert from that code page
//    any other  convert as CP1252
int convUTF8 = 0;
// UTF-8 encodings of CP437 bytes 0x80-0xFF, indexed by (byte - 128).
// Each entry packs one UTF-8 sequence into an unsigned int with the first UTF-8
// byte in the least significant byte, so Convert437toUTF8() can memcpy the leading
// bytes of the entry directly on a little-endian host. initUTF8() reverses the
// bytes of every entry for big-endian hosts. The number of bytes to copy for each
// entry is held in CP437toUTF8DataLen. The hex value in a trailing comment is the
// CP437 code of the first entry on that line.
unsigned int CP437toUTF8Data[128] = {
34755, 48323, 43459, 41667,
42179, 41155, 42435, 42947,
43715, 43971, 43203, 44995,
44739, 44227, 33987, 34243,
35267, 42691, 34499, 46275, // 90
46787, 45763, 48067, 47555,
49091, 38595, 40131, 41666,
41922, 42434, 10978018, 37574,
41411, 44483, 46019, 47811, // A0
45507, 37315, 43714, 47810,
49090, 9473250, 44226, 48578,
48322, 41410, 43970, 48066,
9541346, 9606882, 9672418, 8557794, //B0
10786018, 10589666, 10655202, 9868770,
9803234, 10720738, 9541090, 9934306,
10327522, 10261986, 10196450, 9475298,
9737442, 11834594, 11310306, 10261730, //C0
8426722, 12358882, 10393058, 10458594,
10130914, 9737698, 11113954, 10917346,
10524130, 9475554, 11310562, 10982882,
11048418, 10786274, 10851810, 10065378, //D0
9999842, 9606626, 9672162, 11245026,
11179490, 9999586, 9213154, 8951522,
8689378, 9213666, 9475810, 8427234,
45518, 40899, 37838, 32975, // E0
41934, 33743, 46530, 33999,
42702, 39118, 43470, 46286,
10389730, 34511, 46542, 11110626,
10586594, 45506, 10848738, 10783202, // F0
10521826, 10587362, 47043, 8948194,
45250, 10062050, 47042, 10127586,
12550626, 45762, 10524386, 41154,
};
// Length in bytes (2 or 3) of the UTF-8 sequence stored in the matching entry of
// CP437toUTF8Data, indexed by (byte - 128). Convert437toUTF8() uses it both as the
// memcpy length and as the amount by which the output pointer advances.
unsigned int CP437toUTF8DataLen[128] = {
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 3, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 3, 2, 2,
2, 2, 2, 2,
3, 3, 3, 3,
3, 3, 3, 3,
3, 3, 3, 3,
3, 3, 3, 3,
3, 3, 3, 3,
3, 3, 3, 3,
3, 3, 3, 3,
3, 3, 3, 3,
3, 3, 3, 3,
3, 3, 3, 3,
3, 3, 3, 3,
3, 3, 3, 3,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
3, 2, 2, 3,
3, 2, 3, 3,
3, 3, 2, 3,
2, 3, 2, 3,
3, 2, 3, 2,
};
// UTF-8 encodings of CP1251 bytes 0x80-0xFF, indexed by (byte - 128).
// Each entry packs one UTF-8 sequence into an unsigned int with the first UTF-8
// byte in the least significant byte, so Convert1251toUTF8() can memcpy the leading
// bytes of the entry directly on a little-endian host. initUTF8() reverses the
// bytes of every entry for big-endian hosts. The number of bytes to copy for each
// entry is held in CP1251toUTF8DataLen.
unsigned int CP1251toUTF8Data[128] = {
33488, 33744, 10125538, 37841,
10387682, 10911970, 10518754, 10584290,
11305698, 11567330, 35280, 12157154,
35536, 36048, 35792, 36816,
37585, 9994466, 10060002, 10256610,
10322146, 10649826, 9666786, 9732322,
39106, 10650850, 39377, 12222690,
39633, 40145, 39889, 40913,
41154, 36560, 40657, 35024,
42178, 37074, 42690, 42946,
33232, 43458, 34000, 43970,
44226, 44482, 44738, 34768,
45250, 45506, 34512, 38609,
37330, 46530, 46786, 47042,
37329, 9864418, 38097, 48066,
39121, 34256, 38353, 38865,
37072, 37328, 37584, 37840,
38096, 38352, 38608, 38864,
39120, 39376, 39632, 39888,
40144, 40400, 40656, 40912,
41168, 41424, 41680, 41936,
42192, 42448, 42704, 42960,
43216, 43472, 43728, 43984,
44240, 44496, 44752, 45008,
45264, 45520, 45776, 46032,
46288, 46544, 46800, 47056,
47312, 47568, 47824, 48080,
48336, 48592, 48848, 49104,
32977, 33233, 33489, 33745,
34001, 34257, 34513, 34769,
35025, 35281, 35537, 35793,
36049, 36305, 36561, 36817,
};
// Length in bytes (2 or 3) of the UTF-8 sequence stored in the matching entry of
// CP1251toUTF8Data, indexed by (byte - 128). Convert1251toUTF8() uses it both as the
// memcpy length and as the amount by which the output pointer advances.
unsigned int CP1251toUTF8DataLen[128] = {
2, 2, 3, 2,
3, 3, 3, 3,
3, 3, 2, 3,
2, 2, 2, 2,
2, 3, 3, 3,
3, 3, 3, 3,
2, 3, 2, 3,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 3, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
};
// UTF-8 encodings of CP1252 bytes 0x80-0xFF, indexed by (byte - 128).
// Each entry packs one UTF-8 sequence into an unsigned int with the first UTF-8
// byte in the least significant byte, so Convert1252toUTF8() can memcpy the leading
// bytes of the entry directly on a little-endian host. initUTF8() reverses the
// bytes of every entry for big-endian hosts. The number of bytes to copy for each
// entry is held in CP1252toUTF8DataLen.
unsigned int CP1252toUTF8Data[128] = {
11305698, 33218, 10125538, 37574,
10387682, 10911970, 10518754, 10584290,
34507, 11567330, 41157, 12157154,
37573, 36290, 48581, 36802,
37058, 9994466, 10060002, 10256610,
10322146, 10649826, 9666786, 9732322,
40139, 10650850, 41413, 12222690,
37829, 40386, 48837, 47301,
41154, 41410, 41666, 41922,
42178, 42434, 42690, 42946,
43202, 43458, 43714, 43970,
44226, 44482, 44738, 44994,
45250, 45506, 45762, 46018,
46274, 46530, 46786, 47042,
47298, 47554, 47810, 48066,
48322, 48578, 48834, 49090,
32963, 33219, 33475, 33731,
33987, 34243, 34499, 34755,
35011, 35267, 35523, 35779,
36035, 36291, 36547, 36803,
37059, 37315, 37571, 37827,
38083, 38339, 38595, 38851,
39107, 39363, 39619, 39875,
40131, 40387, 40643, 40899,
41155, 41411, 41667, 41923,
42179, 42435, 42691, 42947,
43203, 43459, 43715, 43971,
44227, 44483, 44739, 44995,
45251, 45507, 45763, 46019,
46275, 46531, 46787, 47043,
47299, 47555, 47811, 48067,
48323, 48579, 48835, 49091,
};
unsigned int CP1252toUTF8DataLen[128] = {
3, 2, 3, 2,
3, 3, 3, 3,
2, 3, 2, 3,
2, 2, 2, 2,
2, 3, 3, 3,
3, 3, 3, 3,
2, 3, 2, 3,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
2, 2, 2, 2,
};
// Guard flag for initUTF8(). The conversion tables are stored ready for use on
// little-endian hosts, so the flag starts TRUE there and initUTF8() does nothing.
// On big-endian hosts it starts FALSE so the first initUTF8() call byte-swaps the
// tables. Besides __BIG_ENDIAN__ (only defined by some toolchains/targets), the
// GCC/Clang __BYTE_ORDER__ macro is checked so other big-endian targets are caught.
#if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))
BOOL initUTF8Done = FALSE;
#else
BOOL initUTF8Done = TRUE;
#endif
// Reverse the four bytes of each of the 128 entries of one conversion table, in place.
static void reverseUTF8TableBytes(unsigned int * table)
{
	int idx;

	for (idx = 0; idx < 128; idx++)
	{
		char original[4];
		char swapped[4];
		int b;

		memcpy(original, &table[idx], 4);
		for (b = 0; b < 4; b++)
			swapped[b] = original[3 - b];
		memcpy(&table[idx], swapped, 4);
	}
}

// Swap the bytes of the UTF-8 code tables on big-endian systems.
// Does nothing once initUTF8Done is set, so the swap is applied at most once
// (and never when the flag starts out TRUE).
VOID initUTF8()
{
	if (initUTF8Done)
		return;

	reverseUTF8TableBytes(CP437toUTF8Data);
	reverseUTF8TableBytes(CP1251toUTF8Data);
	reverseUTF8TableBytes(CP1252toUTF8Data);

	initUTF8Done = TRUE;
}
// Report whether any of the first len bytes at cpt has its top bit set (> 127).
// Returns 1 (TRUE) if such a byte is found, 0 (FALSE) otherwise, including when
// len is zero or negative.
int Is8Bit(unsigned char *cpt, int len)
{
	int n;

	// Index from the start of the buffer; the buffer pointer itself is never
	// moved in front of the first byte.
	for (n = 0; n < len; n++)
	{
		if (cpt[n] > 127)
			return 1;
	}
	return 0;
}
// Report whether byte idx of buf is a UTF-8 continuation byte (10xxxxxx).
// Any position at or beyond len is treated as a continuation byte, standing in
// for trailing bytes that may arrive in the next packet.
static int utf8ContinuationOrBeyond(const unsigned char * buf, int len, int idx)
{
	if (idx >= len)
		return 1;
	return (buf[idx] & 0xC0) == 0x80;
}

// Loose check that the len bytes at ptr could be UTF-8 text taken from a stream.
// It has to be loose because a UTF-8 sequence may be split over packets:
//  - up to three continuation bytes at the start are skipped, as they may be the
//    tail of a sequence begun in the previous packet;
//  - a sequence cut off by the end of the buffer is accepted, as the missing
//    trailing bytes may be in the next packet.
// Returns 1 (TRUE) if every remaining byte is ASCII or part of a 2, 3 or 4 byte
// sequence with the right number of continuation bytes, otherwise 0 (FALSE).
// The buffer is only read; nothing is written at or beyond ptr[len].
int IsUTF8(unsigned char *ptr, int len)
{
	int pos = 0;
	int skipped;

	// Don't check the first 3 bytes if they could be part of an earlier sequence
	for (skipped = 0; skipped < 3; skipped++)
	{
		if (!utf8ContinuationOrBeyond(ptr, len, pos))
			break;
		pos++;
	}

	while (pos < len)
	{
		unsigned char lead = ptr[pos];
		int trail;
		int k;

		if (lead < 128)
		{
			pos++;
			continue;
		}

		if ((lead & 0xF8) == 0xF0)
			trail = 3;			// start of 4-byte sequence
		else if ((lead & 0xF0) == 0xE0)
			trail = 2;			// start of 3-byte sequence
		else if ((lead & 0xE0) == 0xC0)
			trail = 1;			// start of 2-byte sequence
		else
			return 0;			// stray continuation byte or invalid lead byte

		for (k = 1; k <= trail; k++)
		{
			if (!utf8ContinuationOrBeyond(ptr, len, pos + k))
				return 0;
		}
		pos += trail + 1;
	}
	return 1;
}
// Check that the len bytes at ptr are made up only of ASCII bytes and complete
// 2, 3 or 4 byte UTF-8 sequences (lead byte followed by the right number of
// continuation bytes).
// This is simpler than the Term version, as it only handles complete lines of
// text, so can't get split sequences: leading continuation bytes are not skipped,
// and a sequence that would run past the end of the buffer is rejected without
// reading beyond ptr[len - 1].
// Returns 1 (TRUE) if the buffer passes, 0 (FALSE) otherwise.
int WebIsUTF8(unsigned char *ptr, int len)
{
	int pos = 0;

	while (pos < len)
	{
		unsigned char lead = ptr[pos];
		int trail;
		int k;

		if (lead < 128)
		{
			pos++;
			continue;
		}

		if ((lead & 0xF8) == 0xF0)
			trail = 3;			// start of 4-byte sequence
		else if ((lead & 0xF0) == 0xE0)
			trail = 2;			// start of 3-byte sequence
		else if ((lead & 0xE0) == 0xC0)
			trail = 1;			// start of 2-byte sequence
		else
			return 0;			// stray continuation byte or invalid lead byte

		// Not enough bytes left in the buffer for the whole sequence
		if (trail > len - 1 - pos)
			return 0;

		for (k = 1; k <= trail; k++)
		{
			if ((ptr[pos + k] & 0xC0) != 0x80)
				return 0;
		}
		pos += trail + 1;
	}
	return 1;
}
// Convert len bytes of CP437 text at MsgPtr to UTF-8, writing the result to UTF.
// Bytes below 128 are copied as they are; each byte of 128 or above is replaced by
// the leading CP437toUTF8DataLen[] bytes of its CP437toUTF8Data[] entry.
// Returns the number of bytes written to UTF. No terminator is added and the
// size of UTF is not checked.
int Convert437toUTF8(unsigned char * MsgPtr, int len, unsigned char * UTF)
{
	unsigned char * out = UTF;
	int i;

	for (i = 0; i < len; i++)
	{
		unsigned int ch = MsgPtr[i];

		if (ch >= 128)
		{
			unsigned int slot = ch - 128;
			unsigned int seqLen = CP437toUTF8DataLen[slot];

			memcpy(out, &CP437toUTF8Data[slot], seqLen);
			out += seqLen;
		}
		else
		{
			*out = ch;
			out++;
		}
	}
	return (int)(out - UTF);
}
// Convert len bytes of CP1251 text at MsgPtr to UTF-8, writing the result to UTF.
// Bytes below 128 are copied as they are; each byte of 128 or above is replaced by
// the leading CP1251toUTF8DataLen[] bytes of its CP1251toUTF8Data[] entry.
// Returns the number of bytes written to UTF. No terminator is added and the
// size of UTF is not checked.
int Convert1251toUTF8(unsigned char * MsgPtr, int len, unsigned char * UTF)
{
	unsigned char * out = UTF;
	int i;

	for (i = 0; i < len; i++)
	{
		unsigned int ch = MsgPtr[i];

		if (ch >= 128)
		{
			unsigned int slot = ch - 128;
			unsigned int seqLen = CP1251toUTF8DataLen[slot];

			memcpy(out, &CP1251toUTF8Data[slot], seqLen);
			out += seqLen;
		}
		else
		{
			*out = ch;
			out++;
		}
	}
	return (int)(out - UTF);
}
// Convert len bytes of CP1252 text at MsgPtr to UTF-8, writing the result to UTF.
// Bytes below 128 are copied as they are; each byte of 128 or above is replaced by
// the leading CP1252toUTF8DataLen[] bytes of its CP1252toUTF8Data[] entry.
// Returns the number of bytes written to UTF. No terminator is added and the
// size of UTF is not checked.
int Convert1252toUTF8(unsigned char * MsgPtr, int len, unsigned char * UTF)
{
	unsigned char * out = UTF;
	int i;

	for (i = 0; i < len; i++)
	{
		unsigned int ch = MsgPtr[i];

		if (ch >= 128)
		{
			unsigned int slot = ch - 128;
			unsigned int seqLen = CP1252toUTF8DataLen[slot];

			memcpy(out, &CP1252toUTF8Data[slot], seqLen);
			out += seqLen;
		}
		else
		{
			*out = ch;
			out++;
		}
	}
	return (int)(out - UTF);
}
// Guess which code page (437, 1251 or 1252) the Len bytes at Char are written in.
// Three tallies are kept: bytes below 65 (digits, spaces, punctuation, controls),
// bytes above 127, and those bytes above 127 that are 0xF8 or in 179..223, which
// are counted as line-draw characters.
//  - no bytes above 127:                 1252 (the choice doesn't really matter)
//  - every byte above 127 is line-draw:  437
//  - fewer bytes in 65..127 than above 127: 1251 (mainly high bytes, probably Cyrillic)
//  - otherwise:                          1252 (mainly low bytes, probably Latin)
int TrytoGuessCode(unsigned char * Char, int Len)
{
	int highCount = 0;
	int boxCount = 0;
	int lowCount = 0;
	int midCount;
	int i;

	for (i = 0; i < Len; i++)
	{
		unsigned char ch = Char[i];

		if (ch < 65)
		{
			lowCount++;
			continue;
		}
		if (ch <= 127)
			continue;

		highCount++;
		if (ch == 0xF8 || (ch >= 179 && ch <= 223))
			boxCount++;
	}

	if (highCount == 0)
		return 1252;

	if (highCount == boxCount)
		return 437;

	// Bytes in the range 65..127
	midCount = Len - (lowCount + highCount);

	return (midCount < highCount) ? 1251 : 1252;
}
// 16384-byte global scratch buffer. It is not referenced by any function in the
// part of the file seen here; presumably callers pass it as the output buffer of
// checkUTF8() - TODO confirm against the callers.
unsigned char outbuffer[16384]; // I don't think this needs to be thread safe
// Produce a UTF-8 version of the len bytes at in, written to out.
//  - If convUTF8 is -1, or the input has no bytes above 127, the input is copied
//    to out unchanged and len is returned.
//  - Otherwise the input is converted from a code page: the one chosen by
//    TrytoGuessCode() when convUTF8 is 0 (auto), else 437 or 1251 if convUTF8 has
//    that value, else CP1252.
// Returns the number of bytes written to out (0 if len is zero or negative).
// The size of out is not checked; each input byte can become up to 3 output bytes,
// so the caller must supply a large enough buffer.
int checkUTF8(unsigned char * in, int len, unsigned char * out)
{
	// We mustn't mess with input string, so each piece is converted from a copy
	unsigned char Msg[8192];
	int code = convUTF8;
	int done = 0;
	int u = 0;

	if (len <= 0)
		return 0;

	if (convUTF8 == -1 || !Is8Bit(in, len))
	{
		// just copy to output
		memcpy(out, in, len);
		return len;
	}

	// Auto - Try to guess encoding. The guess looks at the whole input (it only
	// reads it), so every chunk below is converted with the same code page.
	if (convUTF8 == 0)
		code = TrytoGuessCode(in, len);

	// Convert in pieces no larger than the scratch buffer, so that input of any
	// length can be handled without overrunning Msg.
	while (done < len)
	{
		int chunk = len - done;

		if (chunk > (int)sizeof(Msg))
			chunk = (int)sizeof(Msg);

		memcpy(Msg, in + done, chunk);

		if (code == 437)
			u += Convert437toUTF8(Msg, chunk, out + u);
		else if (code == 1251)
			u += Convert1251toUTF8(Msg, chunk, out + u);
		else
			u += Convert1252toUTF8(Msg, chunk, out + u);

		done += chunk;
	}
	return u;
}