1841 lines
50 KiB
C++
1841 lines
50 KiB
C++
///////////////////////////////////////////////////////////////////////////////
|
|
// Copyright (c) Electronic Arts Inc. All rights reserved.
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
#include <EAStdC/internal/Config.h>
|
|
#include <EAStdC/EATextUtil.h>
|
|
#include <EAStdC/EAString.h>
|
|
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////////////
|
|
// EATEXTUTIL_MIN / EATEXTUTIL_MAX
|
|
//
|
|
#define EATEXTUTIL_MIN(a, b) ((a) < (b) ? (a) : (b))
|
|
#define EATEXTUTIL_MAX(a, b) ((a) > (b) ? (a) : (b))
|
|
|
|
|
|
namespace EA
|
|
{
|
|
namespace StdC
|
|
{
|
|
|
|
|
|
extern uint8_t utf8lengthTable[256];
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// UTF8Validate
|
|
//
|
|
// There are multiple definitions of what a valid UTF8 string is. UTF8 allows
|
|
// the ability to encode the same UTF16 character in multiple ways. This in
|
|
// one sense is a legal UTF8 array. However, for some security reasons it is
|
|
// sometimes considered that a UTF8 array is illegal (or at least 'unsafe')
|
|
// if it encodes some character with more bytes than needed. Actually the
|
|
// Unicode standard v3.0 says that these 'insecure' UTF8 sequences are
|
|
// formally illegal to generate but not illegal to interpret.
|
|
// See "http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html"
|
|
//
|
|
// We take the high-security approach here, though it is slower. We could write
|
|
// a simpler function that does a non-security check with the simple table
|
|
// of info here:
|
|
// 0x00-0x7f are single standalone bytes.
|
|
// 0xc2-0xFD are first byte of a multi-byte sequence.
|
|
// 0xc2-0xdf are first byte of a pair.
|
|
// 0xe0-0xef are first byte of a triplet.
|
|
// 0x00-0xf7 are first byte of a quadruplet.
|
|
// 0xf8-0xfb are first byte of a 5-tuplet.
|
|
// 0xfc-0xfd are first byte of a 6-tuplet.
|
|
// 0xfe-0xff are invalid bytes anywhere in a UTF8 string.
|
|
// 0x80-0xbf are the second-sixth byte of a multi-byte sequence, though not all values are valid for all such bytes.
|
|
//
|
|
// See 'http://www.cl.cam.ac.uk/~mgk25/unicode.html' or search for "UTF8 FAQ"
|
|
// on the Internet for more details on UTF8 and Unicode.
|
|
//
|
|
EASTDC_API bool UTF8Validate(const char* pText, size_t nLength)
|
|
{
|
|
const uint8_t* pSource8 = (const uint8_t*)pText;
|
|
const uint8_t* const pSource8End = pSource8 + nLength;
|
|
|
|
while(pSource8 < pSource8End)
|
|
{
|
|
if(pSource8[0] < 0x80)
|
|
++pSource8;
|
|
else if(pSource8[0] < 0xC2)
|
|
break; // The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
|
|
else if(pSource8[0] < 0xE0) // If 2 input chars result in 1 output char...
|
|
{
|
|
if(pSource8End - pSource8 >= 2)
|
|
{
|
|
if(!((pSource8[1] ^ 0x80) < 0x40))
|
|
break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
|
|
pSource8 += 2;
|
|
}
|
|
else
|
|
break; //The input string is not long enough to finish reading the current character.
|
|
}
|
|
else if(pSource8[0] < 0xF0) // If 3 input chars result in 1 output char...
|
|
{
|
|
if((pSource8End - pSource8) >= 3)
|
|
{
|
|
if(!(((pSource8[1] ^ 0x80) < 0x40) &&
|
|
((pSource8[2] ^ 0x80) < 0x40) &&
|
|
(pSource8[0] >= 0xE1 || pSource8[1] >= 0xA0)))
|
|
break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
|
|
pSource8 += 3;
|
|
}
|
|
else
|
|
break; //The input string is not long enough to finish reading the current character.
|
|
}
|
|
else if(pSource8[0] < 0xF8) // If 4 input chars result in 1 output char...
|
|
{
|
|
if((pSource8End - pSource8) >= 4)
|
|
{
|
|
if(!(((pSource8[1] ^ 0x80) < 0x40) &&
|
|
((pSource8[2] ^ 0x80) < 0x40) &&
|
|
((pSource8[3] ^ 0x80) < 0x40) &&
|
|
(pSource8[0] >= 0xF1 || pSource8[1] >= 0x90)))
|
|
break; // The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
|
|
pSource8 += 4;
|
|
}
|
|
else
|
|
break; //The input string is not long enough to finish reading the current character.
|
|
}
|
|
else if(pSource8[0] < 0xFC) // If 5 input chars result in 1 output char...
|
|
{
|
|
if((pSource8End - pSource8) >= 5)
|
|
{
|
|
if(!(((pSource8[1] ^ 0x80) < 0x40) &&
|
|
((pSource8[2] ^ 0x80) < 0x40) &&
|
|
((pSource8[3] ^ 0x80) < 0x40) &&
|
|
((pSource8[4] ^ 0x80) < 0x40) &&
|
|
(pSource8[0] >= 0xf9 || pSource8[1] >= 0x88)))
|
|
break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
|
|
pSource8 += 5;
|
|
}
|
|
else
|
|
break; //The input string is not long enough to finish reading the current character.
|
|
}
|
|
else if(pSource8[0] < 0xFE) // If 6 input chars result in 1 output char...
|
|
{
|
|
if((pSource8End - pSource8) >= 6)
|
|
{
|
|
if(!(((pSource8[1] ^ 0x80) < 0x40) &&
|
|
((pSource8[2] ^ 0x80) < 0x40) &&
|
|
((pSource8[3] ^ 0x80) < 0x40) &&
|
|
((pSource8[4] ^ 0x80) < 0x40) &&
|
|
((pSource8[5] ^ 0x80) < 0x40) &&
|
|
(pSource8[0] >= 0xfd || pSource8[1] >= 0x84)))
|
|
break; //The character is invalid. It is important that we check for this because various security issues potentially arise if we don't.
|
|
pSource8 += 6;
|
|
}
|
|
else
|
|
break; //The input string is not long enough to finish reading the current character.
|
|
}
|
|
else //Else the current input char is invalid.
|
|
break;
|
|
}
|
|
|
|
return (pSource8 == pSource8End); // The return value is OK if we successfully processed all characters.
|
|
}
|
|
|
|
|
|
// Returns the pointer p incremented by n multibyte characters.
|
|
// The string must be a valid UTF8 string or else the behavior is undefined.
|
|
// If the string is not known to be valid, then it should be first validated independently
|
|
// or a validating version of this function should be used instead.
|
|
EASTDC_API char* UTF8Increment(const char* p, size_t n)
|
|
{
|
|
while(n--)
|
|
{
|
|
// To do: Change this code to instead use the utf8lengthTable fropm EAString.cpp
|
|
|
|
const int c = (uint8_t)*p;
|
|
|
|
if (c <= 0xc1) // Actually, any value greater than 0x80 and less than 0xc2 is an invalid leading UTF8 char.
|
|
p += 1;
|
|
else if(c <= 0xdf)
|
|
p += 2;
|
|
else if(c <= 0xef)
|
|
p += 3;
|
|
else if(c <= 0xf7)
|
|
p += 4;
|
|
else if(c <= 0xfb)
|
|
p += 5;
|
|
else if(c <= 0xfd)
|
|
p += 6;
|
|
else
|
|
p += 1; // Error. We return 1 instead of 0 or -1 because the user is probably iterating a string and so this is safer.
|
|
}
|
|
|
|
return (char*)p;
|
|
}
|
|
|
|
|
|
// Returns the pointer p decremented by n multibyte characters.
|
|
// The string must be decrementable by the given number of characters or else
|
|
// the behavior becomes undefined.
|
|
// The string must be a valid UTF8 string or else the behavior is undefined.
|
|
// If the string is not known to be valid, then it should be first validated independently
|
|
// or a validating version of this function should be used instead.
|
|
EASTDC_API char* UTF8Decrement(const char* p, size_t n)
|
|
{
|
|
while(n)
|
|
{
|
|
if(!UTF8IsFollowByte(*--p))
|
|
--n;
|
|
}
|
|
|
|
return (char*)p;
|
|
}
|
|
|
|
|
|
// Returns number of Unicode characters are in the UTF8-encoded string.
|
|
// Return value will be <= Strlen(pString).
|
|
// The string p must be 0-terminated or the behavior of this function is undefined.
|
|
// The string must be a valid UTF8 string or else the behavior is undefined.
|
|
// If the string is not known to be valid, then it should be first validated independently
|
|
// or a validating version of this function should be used instead.
|
|
EASTDC_API size_t UTF8Length(const char* p)
|
|
{
|
|
size_t n = 0;
|
|
|
|
while(*p)
|
|
{
|
|
if((*p & 0xc0) != 0x80) // If this is a leading char...
|
|
++n;
|
|
++p;
|
|
}
|
|
|
|
return n;
|
|
}
|
|
|
|
|
|
// Returns number of characters that would be in a UTF8-encoded string.
|
|
// Return value will be >= Strlen(pString).
|
|
// The string p must be 0-terminated or the behavior of this function is undefined.
|
|
EASTDC_API size_t UTF8Length(const char16_t* p)
|
|
{
|
|
size_t n = 0;
|
|
uint32_t c;
|
|
|
|
while((c = *p++) != 0)
|
|
{
|
|
if(c < 0x00000080)
|
|
n += 1;
|
|
else if(c < 0x00000800)
|
|
n += 2;
|
|
else // if(c < 0x00010000)
|
|
n += 3;
|
|
}
|
|
|
|
return n;
|
|
}
|
|
|
|
|
|
// Returns number of characters that would be in a UTF8-encoded string.
|
|
// Return value will be >= Strlen(pString).
|
|
// The string p must be 0-terminated or the behavior of this function is undefined.
|
|
// Assumes the input values are valid, else the return value will be wrong.
|
|
EASTDC_API size_t UTF8Length(const char32_t* p)
|
|
{
|
|
size_t n = 0;
|
|
uint32_t c;
|
|
|
|
while((c = (uint32_t)*p++) != 0)
|
|
{
|
|
if(c < 0x00000080)
|
|
n += 1;
|
|
else if(c < 0x00000800)
|
|
n += 2;
|
|
else if(c < 0x00010000)
|
|
n += 3;
|
|
else if(c < 0x00200000)
|
|
n += 4;
|
|
else if(c < 0x04000000)
|
|
n += 5;
|
|
else if(c <= 0x7fffffff)
|
|
n += 6;
|
|
else
|
|
n += 1; // Error
|
|
}
|
|
|
|
return n;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// UTF8CharSize
|
|
//
|
|
// Returns the byte length of the UTF8 multibyte char pointed to by p.
|
|
// The input p must point to the beginning of a UTF8 multibyte sequence,
|
|
// else the return value is 1.
|
|
//
|
|
// 0x00-0x80 are single bytes.
|
|
// 0x81-0xc1 are invalid values for a leading UTF8 char.
|
|
// 0xc2-0xdf are first byte of a pair.
|
|
// 0xe0-0xef are first byte of a triplet.
|
|
// 0xf0-0xf7 are first byte of a quadruplet.
|
|
// 0xf8-0xfb are first byte of a 5-tuplet.
|
|
// 0xfc-0xfd are first byte of a 6-tuplet.
|
|
// 0xfe-0xff are invalid values for a leading UTF8 char.
|
|
//
|
|
EASTDC_API size_t UTF8CharSize(const char* p)
|
|
{
|
|
// To do: Change this code to instead use the utf8lengthTable fropm EAString.cpp
|
|
|
|
const int c = (uint8_t)*p;
|
|
|
|
if (c <= 0xc1) // Any value greater than 0x80 and less than 0xc2 is an invalid leading UTF8 char.
|
|
return 1;
|
|
else if(c <= 0xdf)
|
|
return 2;
|
|
else if(c <= 0xef)
|
|
return 3;
|
|
else if(c <= 0xf7) // This refers to a unicode point > char16_t
|
|
return 4;
|
|
else if(c <= 0xfb) // This refers to a unicode point > char16_t
|
|
return 5;
|
|
else if(c <= 0xfd) // This refers to a unicode point > char16_t
|
|
return 6;
|
|
|
|
return 1; // Error. We return 1 instead of 0 or -1 because the user is probably iterating a string and so this is safer.
|
|
}
|
|
|
|
|
|
EASTDC_API size_t UTF8CharSize(char16_t c)
|
|
{
|
|
if(c < 0x00000080)
|
|
return 1;
|
|
else if(c < 0x00000800)
|
|
return 2;
|
|
else // if(c < 0x00010000)
|
|
return 3;
|
|
|
|
// The following would be used if the input was 32 bit instead of 16 bit.
|
|
//else if(c < 0x00010000)
|
|
// return 3;
|
|
//else if(c < 0x00200000)
|
|
// return 4;
|
|
//else if(c < 0x04000000)
|
|
// return 5;
|
|
//else if(c <= 0x7fffffff)
|
|
// return 6;
|
|
//
|
|
//return 1; // Error
|
|
}
|
|
|
|
|
|
EASTDC_API size_t UTF8CharSize(char32_t c)
|
|
{
|
|
if((uint32_t)c < 0x00000080)
|
|
return 1;
|
|
else if((uint32_t)c < 0x00000800)
|
|
return 2;
|
|
else if((uint32_t)c < 0x00010000)
|
|
return 3;
|
|
else if((uint32_t)c < 0x00200000)
|
|
return 4;
|
|
else if((uint32_t)c < 0x04000000)
|
|
return 5;
|
|
else if((uint32_t)c < 0x80000000)
|
|
return 6;
|
|
|
|
return 1; // Error
|
|
}
|
|
|
|
|
|
EASTDC_API char16_t UTF8ReadChar(const char* p, const char** ppEnd)
|
|
{
|
|
char16_t c = 0;
|
|
const char* pCurrent;
|
|
uint8_t cChar0((uint8_t)*p), cChar1, cChar2, cChar3;
|
|
|
|
//assert((cChar0 != 0xFE) && (cChar0 != 0xFF)); // No byte can contain 0xFE or 0xFF
|
|
|
|
if(cChar0 < 0x80)
|
|
{
|
|
c = cChar0;
|
|
pCurrent = p + 1;
|
|
}
|
|
else
|
|
{
|
|
//assert((cChar0 & 0xC0) == 0xC0); // The top two bits need to be equal to 1
|
|
|
|
if((cChar0 & 0xE0) == 0xC0)
|
|
{
|
|
c = (char16_t)((cChar0 & 0x1F) << 6);
|
|
|
|
cChar1 = static_cast<uint8_t>(p[1]);
|
|
//assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
|
|
c |= cChar1 & 0x3F;
|
|
|
|
//assert(c >= 0x0080 && c < 0x0800); // Check that we have the smallest coding
|
|
|
|
pCurrent = p + 2;
|
|
}
|
|
else if((cChar0 & 0xF0) == 0xE0)
|
|
{
|
|
c = (char16_t)((cChar0 & 0xF) << 12);
|
|
|
|
cChar1 = static_cast<uint8_t>(p[1]);
|
|
//assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
|
|
c |= (cChar1 & 0x3F) << 6;
|
|
|
|
cChar2 = static_cast<uint8_t>(p[2]);
|
|
//assert((cChar2 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
|
|
c |= cChar2 & 0x3F;
|
|
|
|
//assert(c >= 0x00000800 && c < 0x00010000); // Check that we have the smallest coding
|
|
|
|
pCurrent = p + 3;
|
|
}
|
|
else
|
|
{
|
|
//assert((cChar0 & 0xf8) == 0xf0); // We handle the unicode but not UCS-4
|
|
c = (char16_t)((cChar0 & 0x7) << 18);
|
|
|
|
cChar1 = static_cast<uint8_t>(p[1]);
|
|
//assert((cChar1 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
|
|
c |= (char16_t)((cChar1 & 0x3F) << 12);
|
|
|
|
cChar2 = static_cast<uint8_t>(p[2]);
|
|
//assert((cChar2 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
|
|
c |= (cChar2 & 0x3F) << 6;
|
|
|
|
cChar3 = static_cast<uint8_t>(p[3]);
|
|
//assert((cChar3 & 0xC0) == 0x80); // All subsequent code should be b10xxxxxx
|
|
c |= cChar3 & 0x3F;
|
|
|
|
//assert(c >= 0x00010000 && c <= 0x0010FFFF); // Check that we have the smallest coding, Unicode and not ucs-4
|
|
|
|
pCurrent = p + 4;
|
|
}
|
|
}
|
|
|
|
if(ppEnd)
|
|
*ppEnd = pCurrent;
|
|
|
|
return c;
|
|
}
|
|
|
|
|
|
// This function assumes that there is enough space at p to write the char.
|
|
// At most three bytes are needed to write a char16_t value and 6 bytes are
|
|
// needed to write a char32_t value.
|
|
EASTDC_API char* UTF8WriteChar(char* p, char16_t c)
|
|
{
|
|
if(c < 0x80)
|
|
{
|
|
*p++ = (char)(uint8_t)c;
|
|
}
|
|
else if(c < 0x0800)
|
|
{
|
|
*p++ = (char)(uint8_t)((c >> 6) | 0xC0);
|
|
*p++ = (char)(uint8_t)((c & 0x3F) | 0x80);
|
|
}
|
|
else // if(c < 0x00010000)
|
|
{
|
|
*p++ = (char)(uint8_t)((c >> 12) | 0xE0);
|
|
*p++ = (char)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
|
|
*p++ = (char)(uint8_t)((c & 0x3F) | 0x80);
|
|
}
|
|
//else
|
|
//{
|
|
// *p++ = (char)(uint8_t)((c >> 18) | 0xF0);
|
|
// *p++ = (char)(uint8_t)(((c >> 12) & 0x3F) | 0x80);
|
|
// *p++ = (char)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
|
|
// *p++ = (char)(uint8_t)((c & 0x3F) | 0x80);
|
|
//}
|
|
|
|
return p;
|
|
}
|
|
|
|
// This function assumes that there is enough space at p to write the char.
|
|
// At most three bytes are needed to write a char32_t value and 6 bytes are
|
|
// needed to write a char32_t value.
|
|
EASTDC_API char* UTF8WriteChar(char* p, char32_t c)
|
|
{
|
|
if((uint32_t)c < 0x80)
|
|
{
|
|
*p++ = (char)(uint8_t)c;
|
|
}
|
|
else if((uint32_t)c < 0x0800)
|
|
{
|
|
*p++ = (char)(uint8_t)((c >> 6) | 0xC0);
|
|
*p++ = (char)(uint8_t)((c & 0x3F) | 0x80);
|
|
}
|
|
else if((uint32_t)c < 0x00010000)
|
|
{
|
|
*p++ = (char)(uint8_t)((c >> 12) | 0xE0);
|
|
*p++ = (char)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
|
|
*p++ = (char)(uint8_t)((c & 0x3F) | 0x80);
|
|
}
|
|
else
|
|
{
|
|
*p++ = (char)(uint8_t)((c >> 18) | 0xF0);
|
|
*p++ = (char)(uint8_t)(((c >> 12) & 0x3F) | 0x80);
|
|
*p++ = (char)(uint8_t)(((c >> 6) & 0x3F) | 0x80);
|
|
*p++ = (char)(uint8_t)((c & 0x3F) | 0x80);
|
|
}
|
|
|
|
return p;
|
|
}
|
|
|
|
|
|
/// UTF8TrimPartialChar
|
|
///
|
|
/// Trim the string to the last valid UTF8 character. This function has no effect on a UTF8 string that has
|
|
/// entirely valid UTF8 content. It only trims the string if there is an incomplete UTF8 sequence at the
|
|
/// end. The resulting string will always be a valid UTF8 string, whereas the input string may not be.
|
|
/// Returns the strlen of the trimmed string.
|
|
size_t UTF8TrimPartialChar(char* pString, size_t nLength)
|
|
{
|
|
size_t validPos = 0;
|
|
|
|
while(validPos < nLength)
|
|
{
|
|
uint8_t ch = (uint8_t)pString[validPos];
|
|
size_t length = utf8lengthTable[ch];
|
|
|
|
// length = 0 means invalid UTF8 marker
|
|
if((length == 0) || ((validPos + length) > nLength))
|
|
break;
|
|
else
|
|
validPos += length;
|
|
}
|
|
|
|
pString[validPos] = 0;
|
|
return validPos;
|
|
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// UTF8ReplaceInvalidChar
|
|
//
|
|
// This function replaces all invalidate UTF8 characters with the user provided
|
|
// 8-bit replacement. The returned character array is guaranteed null-terminated.
|
|
//
|
|
EASTDC_API char* UTF8ReplaceInvalidChar(const char* pIn, size_t nLength, char* pOut, char replaceWith)
|
|
{
|
|
size_t validPos = 0;
|
|
|
|
while(validPos < nLength)
|
|
{
|
|
uint8_t ch = (uint8_t)pIn[validPos];
|
|
size_t length = utf8lengthTable[ch];
|
|
|
|
// length = 0 means invalid UTF8 marker
|
|
if((length == 0) || ((validPos + length) > nLength))
|
|
{
|
|
pOut[validPos++] = replaceWith;
|
|
}
|
|
else
|
|
{
|
|
for(auto i = validPos; i < validPos + length; i++)
|
|
pOut[i] = pIn[i];
|
|
|
|
validPos += length;
|
|
}
|
|
}
|
|
|
|
pOut[validPos] = 0;
|
|
return pOut + validPos;
|
|
}
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// MatchPattern
|
|
//
|
|
// This function is recursively called on substrings.
|
|
// Used by the WildcardMatch function.
|
|
//
|
|
template <class CharT>
|
|
bool MatchPattern(const CharT* pElement, const CharT* pPattern)
|
|
{
|
|
if((*pPattern == (CharT)'*') && !pPattern[1])
|
|
return true; // The pattern is set to match everything, so return true.
|
|
else if(!*pElement && *pPattern)
|
|
return false; // The element is empty but the pattern is not, so return false.
|
|
else if(!*pElement)
|
|
return true; // The element and pattern are both empty, so we are done. Return true.
|
|
else
|
|
{
|
|
if(*pPattern == (CharT)'*')
|
|
{
|
|
if(MatchPattern(pElement, pPattern+1)) // What this section does is try to match source segments to
|
|
return true; // the '*' portion of the pattern. As many parts of the source that
|
|
else // can be assigned to the '*' portion of the pattern are done. If
|
|
return MatchPattern(pElement+1, pPattern); // not possible, we pop out of the whole thing.
|
|
}
|
|
else if(*pPattern == (CharT)'?')
|
|
return MatchPattern(pElement+1, pPattern+1); // The pattern accepts any character here, so move onto the next character.
|
|
else
|
|
{
|
|
if(*pElement == *pPattern)
|
|
return MatchPattern(pElement+1, pPattern+1); // The current element and pattern chars match, so move onto next character.
|
|
else
|
|
return false; // The current element char simply doesn't match the pattern char, so return false.
|
|
}
|
|
}
|
|
// return true; // This should never get executed, but some compilers might not be smart enough to realize it.
|
|
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// WildcardMatch
|
|
//
|
|
// We go through extra effort below to avoid doing memory allocation in most cases.
|
|
//
|
|
EASTDC_API bool WildcardMatch(const char* pString, const char* pPattern, bool bCaseSensitive)
|
|
{
|
|
if(bCaseSensitive)
|
|
return MatchPattern(pString, pPattern);
|
|
else
|
|
{
|
|
// Do efficient string conversion to lower case...
|
|
char pStringLBuffer[384];
|
|
char* pStringL;
|
|
char* pStringLAllocated;
|
|
size_t nStringLLength = Strlen(pString);
|
|
|
|
if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1))
|
|
{
|
|
pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char[]") char[nStringLLength + 1];
|
|
pStringL = pStringLAllocated;
|
|
}
|
|
else
|
|
{
|
|
pStringLAllocated = NULL;
|
|
pStringL = pStringLBuffer;
|
|
}
|
|
Strcpy(pStringL, pString);
|
|
Strlwr(pStringL);
|
|
|
|
// Do efficient pattern conversion to lower case...
|
|
char pPatternLBuffer[32];
|
|
char* pPatternL;
|
|
char* pPatternLAllocated;
|
|
size_t nPatternLLength = Strlen(pPattern);
|
|
|
|
if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1))
|
|
{
|
|
pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char[]") char[nPatternLLength + 1];
|
|
pPatternL = pPatternLAllocated;
|
|
}
|
|
else
|
|
{
|
|
pPatternLAllocated = NULL;
|
|
pPatternL = pPatternLBuffer;
|
|
}
|
|
Strcpy(pPatternL, pPattern);
|
|
Strlwr(pPatternL);
|
|
|
|
const bool bResult = MatchPattern(pStringL, pPatternL);
|
|
|
|
delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect.
|
|
delete[] pPatternLAllocated;
|
|
|
|
return bResult;
|
|
}
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// WildcardMatch
|
|
//
|
|
// We go through extra effort below to avoid doing memory allocation in most cases.
|
|
//
|
|
EASTDC_API bool WildcardMatch(const char16_t* pString, const char16_t* pPattern, bool bCaseSensitive)
|
|
{
|
|
if(bCaseSensitive)
|
|
return MatchPattern(pString, pPattern);
|
|
else
|
|
{
|
|
// Do efficient string conversion to lower case...
|
|
char16_t pStringLBuffer[384];
|
|
char16_t* pStringL;
|
|
char16_t* pStringLAllocated;
|
|
size_t nStringLLength = Strlen(pString);
|
|
|
|
if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1))
|
|
{
|
|
pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char16[]") char16_t[nStringLLength + 1];
|
|
pStringL = pStringLAllocated;
|
|
}
|
|
else
|
|
{
|
|
pStringLAllocated = NULL;
|
|
pStringL = pStringLBuffer;
|
|
}
|
|
Strcpy(pStringL, pString);
|
|
Strlwr(pStringL);
|
|
|
|
// Do efficient pattern conversion to lower case...
|
|
char16_t pPatternLBuffer[32];
|
|
char16_t* pPatternL;
|
|
char16_t* pPatternLAllocated;
|
|
size_t nPatternLLength = Strlen(pPattern);
|
|
|
|
if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1))
|
|
{
|
|
pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char16[]") char16_t[nPatternLLength + 1];
|
|
pPatternL = pPatternLAllocated;
|
|
}
|
|
else
|
|
{
|
|
pPatternLAllocated = NULL;
|
|
pPatternL = pPatternLBuffer;
|
|
}
|
|
Strcpy(pPatternL, pPattern);
|
|
Strlwr(pPatternL);
|
|
|
|
const bool bResult = MatchPattern(pStringL, pPatternL);
|
|
|
|
delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect.
|
|
delete[] pPatternLAllocated;
|
|
|
|
return bResult;
|
|
}
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// WildcardMatch
|
|
//
|
|
// We go through extra effort below to avoid doing memory allocation in most cases.
|
|
//
|
|
EASTDC_API bool WildcardMatch(const char32_t* pString, const char32_t* pPattern, bool bCaseSensitive)
|
|
{
|
|
if(bCaseSensitive)
|
|
return MatchPattern(pString, pPattern);
|
|
else
|
|
{
|
|
// Do efficient string conversion to lower case...
|
|
char32_t pStringLBuffer[384];
|
|
char32_t* pStringL;
|
|
char32_t* pStringLAllocated;
|
|
size_t nStringLLength = Strlen(pString);
|
|
|
|
if(nStringLLength >= (sizeof(pStringLBuffer) / sizeof(pStringLBuffer[0]) - 1))
|
|
{
|
|
pStringLAllocated = EASTDC_NEW("EATextUtil/StringAllocated/char32[]") char32_t[nStringLLength + 1];
|
|
pStringL = pStringLAllocated;
|
|
}
|
|
else
|
|
{
|
|
pStringLAllocated = NULL;
|
|
pStringL = pStringLBuffer;
|
|
}
|
|
Strcpy(pStringL, pString);
|
|
Strlwr(pStringL);
|
|
|
|
// Do efficient pattern conversion to lower case...
|
|
char32_t pPatternLBuffer[32];
|
|
char32_t* pPatternL;
|
|
char32_t* pPatternLAllocated;
|
|
size_t nPatternLLength = Strlen(pPattern);
|
|
|
|
if(nPatternLLength >= (sizeof(pPatternLBuffer) / sizeof(pPatternLBuffer[0]) - 1))
|
|
{
|
|
pPatternLAllocated = EASTDC_NEW("EATextUtil/PatternAllocated/char32[]") char32_t[nPatternLLength + 1];
|
|
pPatternL = pPatternLAllocated;
|
|
}
|
|
else
|
|
{
|
|
pPatternLAllocated = NULL;
|
|
pPatternL = pPatternLBuffer;
|
|
}
|
|
Strcpy(pPatternL, pPattern);
|
|
Strlwr(pPatternL);
|
|
|
|
const bool bResult = MatchPattern(pStringL, pPatternL);
|
|
|
|
delete[] pStringLAllocated; // In most cases, this will be NULL and there will be no effect.
|
|
delete[] pPatternLAllocated;
|
|
|
|
return bResult;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// GetTextLine
|
|
//
|
|
EASTDC_API const char* GetTextLine(const char* pText, const char* pTextEnd, const char** ppNewText)
|
|
{
|
|
if(pText < pTextEnd)
|
|
{
|
|
while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n'))
|
|
++pText;
|
|
|
|
if(ppNewText)
|
|
{
|
|
*ppNewText = pText;
|
|
|
|
if(*ppNewText < pTextEnd)
|
|
{
|
|
if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n'))
|
|
++*ppNewText;
|
|
}
|
|
}
|
|
}
|
|
else if(ppNewText)
|
|
*ppNewText = pTextEnd;
|
|
|
|
return pText;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// GetTextLine
|
|
//
|
|
EASTDC_API const char16_t* GetTextLine(const char16_t* pText, const char16_t* pTextEnd, const char16_t** ppNewText)
|
|
{
|
|
if(pText < pTextEnd)
|
|
{
|
|
while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n'))
|
|
++pText;
|
|
|
|
if(ppNewText)
|
|
{
|
|
*ppNewText = pText;
|
|
|
|
if(*ppNewText < pTextEnd)
|
|
{
|
|
if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n'))
|
|
++*ppNewText;
|
|
}
|
|
}
|
|
}
|
|
else if(ppNewText)
|
|
*ppNewText = pTextEnd;
|
|
|
|
return pText;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// GetTextLine
|
|
//
|
|
EASTDC_API const char32_t* GetTextLine(const char32_t* pText, const char32_t* pTextEnd, const char32_t** ppNewText)
|
|
{
|
|
if(pText < pTextEnd)
|
|
{
|
|
while((pText < pTextEnd) && (*pText != '\r') && (*pText != '\n'))
|
|
++pText;
|
|
|
|
if(ppNewText)
|
|
{
|
|
*ppNewText = pText;
|
|
|
|
if(*ppNewText < pTextEnd)
|
|
{
|
|
if((++*ppNewText < pTextEnd) && (**ppNewText ^ *pText) == ('\r' ^ '\n'))
|
|
++*ppNewText;
|
|
}
|
|
}
|
|
}
|
|
else if(ppNewText)
|
|
*ppNewText = pTextEnd;
|
|
|
|
return pText;
|
|
}
|
|
|
|
|
|
|
|
|
|
EASTDC_API bool ParseDelimitedText(const char* pText, const char* pTextEnd, char cDelimiter,
|
|
const char*& pToken, const char*& pTokenEnd, const char** ppNewText)
|
|
{
|
|
int nQuoteLevel = 0;
|
|
bool bDelimiterFound = false;
|
|
|
|
// We remove leading spaces.
|
|
for(pToken = pText; pToken < pTextEnd; ++pToken)
|
|
{
|
|
if((*pToken != ' ') && (*pToken != '\t'))
|
|
break;
|
|
}
|
|
|
|
for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd)
|
|
{
|
|
const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd);
|
|
|
|
if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace.
|
|
bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t'));
|
|
else
|
|
bDelimiterFound = (*pTokenEnd == cDelimiter);
|
|
|
|
if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character...
|
|
{
|
|
if(!bDelimiterFound)
|
|
++pTokenEnd;
|
|
|
|
const bool bInQuotes = ((nQuoteLevel & 1) != 0);
|
|
|
|
if(!bInQuotes || bLastCharacter) // If not within a quoted section...
|
|
{
|
|
if(ppNewText)
|
|
*ppNewText = pTokenEnd;
|
|
|
|
if((cDelimiter != ' ') && (pTokenEnd != pTextEnd))
|
|
{
|
|
// Eliminate spaces before the trailing delimiter.
|
|
while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t')))
|
|
pTokenEnd--;
|
|
}
|
|
|
|
if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"'))
|
|
{
|
|
pToken++;
|
|
pTokenEnd--;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
}
|
|
else if(*pTokenEnd == '"')
|
|
nQuoteLevel++;
|
|
}
|
|
|
|
if(ppNewText)
|
|
*ppNewText = pTokenEnd;
|
|
|
|
return false;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// ParseDelimitedText
|
|
//
|
|
// This function takes a line text that has fields separated by delimiters
|
|
// and parses the line into the component fields. It is common to read
|
|
// command lines like this or to parse ini file settings like this.
|
|
//
|
|
EASTDC_API bool ParseDelimitedText(const char16_t* pText, const char16_t* pTextEnd, char16_t cDelimiter,
|
|
const char16_t*& pToken, const char16_t*& pTokenEnd, const char16_t** ppNewText)
|
|
{
|
|
int nQuoteLevel = 0;
|
|
bool bDelimiterFound = false;
|
|
|
|
// We remove leading spaces.
|
|
for(pToken = pText; pToken < pTextEnd; ++pToken)
|
|
{
|
|
if((*pToken != ' ') && (*pToken != '\t'))
|
|
break;
|
|
}
|
|
|
|
for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd)
|
|
{
|
|
const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd);
|
|
|
|
if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace.
|
|
bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t'));
|
|
else
|
|
bDelimiterFound = (*pTokenEnd == cDelimiter);
|
|
|
|
if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character...
|
|
{
|
|
if(!bDelimiterFound)
|
|
++pTokenEnd;
|
|
|
|
const bool bInQuotes = ((nQuoteLevel & 1) != 0);
|
|
|
|
if(!bInQuotes || bLastCharacter) // If not within a quoted section...
|
|
{
|
|
if(ppNewText)
|
|
*ppNewText = pTokenEnd;
|
|
|
|
if((cDelimiter != ' ') && (pTokenEnd != pTextEnd))
|
|
{
|
|
// Eliminate spaces before the trailing delimiter.
|
|
while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t')))
|
|
pTokenEnd--;
|
|
}
|
|
|
|
if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"'))
|
|
{
|
|
pToken++;
|
|
pTokenEnd--;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
}
|
|
else if(*pTokenEnd == '"')
|
|
nQuoteLevel++;
|
|
}
|
|
|
|
if(ppNewText)
|
|
*ppNewText = pTokenEnd;
|
|
|
|
return false;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// ParseDelimitedText
|
|
//
|
|
// This function takes a line text that has fields separated by delimiters
|
|
// and parses the line into the component fields. It is common to read
|
|
// command lines like this or to parse ini file settings like this.
|
|
//
|
|
EASTDC_API bool ParseDelimitedText(const char32_t* pText, const char32_t* pTextEnd, char32_t cDelimiter,
|
|
const char32_t*& pToken, const char32_t*& pTokenEnd, const char32_t** ppNewText)
|
|
{
|
|
int nQuoteLevel = 0;
|
|
bool bDelimiterFound = false;
|
|
|
|
// We remove leading spaces.
|
|
for(pToken = pText; pToken < pTextEnd; ++pToken)
|
|
{
|
|
if((*pToken != ' ') && (*pToken != '\t'))
|
|
break;
|
|
}
|
|
|
|
for(pTokenEnd = pToken; pTokenEnd < pTextEnd; ++pTokenEnd)
|
|
{
|
|
const bool bLastCharacter = ((pTokenEnd + 1) == pTextEnd);
|
|
|
|
if(cDelimiter == ' ') // The space char delimiter is a special case that means delimit by whitespace.
|
|
bDelimiterFound = ((*pTokenEnd == ' ') || (*pTokenEnd == '\t'));
|
|
else
|
|
bDelimiterFound = (*pTokenEnd == cDelimiter);
|
|
|
|
if(bDelimiterFound || bLastCharacter) // If we found a delimiter or if we are on the last character...
|
|
{
|
|
if(!bDelimiterFound)
|
|
++pTokenEnd;
|
|
|
|
const bool bInQuotes = ((nQuoteLevel & 1) != 0);
|
|
|
|
if(!bInQuotes || bLastCharacter) // If not within a quoted section...
|
|
{
|
|
if(ppNewText)
|
|
*ppNewText = pTokenEnd;
|
|
|
|
if((cDelimiter != ' ') && (pTokenEnd != pTextEnd))
|
|
{
|
|
// Eliminate spaces before the trailing delimiter.
|
|
while((pTokenEnd != pToken) && ((pTokenEnd[-1] == ' ') || (pTokenEnd[-1] == '\t')))
|
|
pTokenEnd--;
|
|
}
|
|
|
|
if((pToken != pTextEnd) && (*pToken == '"') && (pTokenEnd[-1] == '"'))
|
|
{
|
|
pToken++;
|
|
pTokenEnd--;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
}
|
|
else if(*pTokenEnd == '"')
|
|
nQuoteLevel++;
|
|
}
|
|
|
|
if(ppNewText)
|
|
*ppNewText = pTokenEnd;
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// ConvertBinaryDataToASCIIArray
|
|
//
|
|
// Since every binary byte converts to exactly 2 ascii bytes, the ASCII
|
|
// array must have space for at least twice the amount of bytes
|
|
// as 'nBinaryDataLength' + 1.
|
|
//
|
|
EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char* pASCIIArray)
|
|
{
|
|
const uint8_t* pBinaryData = (uint8_t*)pBinaryData_;
|
|
const uint8_t* pEnd = pBinaryData + nBinaryDataLength;
|
|
|
|
while(pBinaryData < pEnd)
|
|
{
|
|
*pASCIIArray = (char)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15.
|
|
if(*pASCIIArray > '9')
|
|
*pASCIIArray += 7; // Convert the ':' to 'A', for example.
|
|
pASCIIArray++;
|
|
*pASCIIArray = (char)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15.
|
|
if(*pASCIIArray > '9')
|
|
*pASCIIArray += 7; // Convert the ':' to 'A', for example.
|
|
pASCIIArray++;
|
|
pBinaryData++;
|
|
}
|
|
|
|
*pASCIIArray = '\0';
|
|
}
|
|
|
|
EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char16_t* pASCIIArray)
|
|
{
|
|
const uint8_t* pBinaryData = (uint8_t*)pBinaryData_;
|
|
const uint8_t* pEnd = pBinaryData + nBinaryDataLength;
|
|
|
|
while(pBinaryData < pEnd)
|
|
{
|
|
*pASCIIArray = (char16_t)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15.
|
|
if(*pASCIIArray > '9')
|
|
*pASCIIArray += 7; // Convert the ':' to 'A', for example.
|
|
pASCIIArray++;
|
|
*pASCIIArray = (char16_t)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15.
|
|
if(*pASCIIArray > '9')
|
|
*pASCIIArray += 7; // Convert the ':' to 'A', for example.
|
|
pASCIIArray++;
|
|
pBinaryData++;
|
|
}
|
|
|
|
*pASCIIArray = '\0';
|
|
}
|
|
|
|
EASTDC_API void ConvertBinaryDataToASCIIArray(const void* pBinaryData_, size_t nBinaryDataLength, char32_t* pASCIIArray)
|
|
{
|
|
const uint8_t* pBinaryData = (uint8_t*)pBinaryData_;
|
|
const uint8_t* pEnd = pBinaryData + nBinaryDataLength;
|
|
|
|
while(pBinaryData < pEnd)
|
|
{
|
|
*pASCIIArray = (char32_t)('0' + ((*pBinaryData & 0xf0) >> 4)); // Convert the high byte to a number between 1 and 15.
|
|
if(*pASCIIArray > '9')
|
|
*pASCIIArray += 7; // Convert the ':' to 'A', for example.
|
|
pASCIIArray++;
|
|
*pASCIIArray = (char32_t)('0' + (*pBinaryData & 0x0f)); // Convert the low byte to a number between 1 and 15.
|
|
if(*pASCIIArray > '9')
|
|
*pASCIIArray += 7; // Convert the ':' to 'A', for example.
|
|
pASCIIArray++;
|
|
pBinaryData++;
|
|
}
|
|
|
|
*pASCIIArray = '\0';
|
|
}
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// ConvertASCIIArrayToBinaryData (8 bit version)
|
|
//
|
|
// We have a boolean return value because it is possible that the ascii data is
|
|
// corrupt. We check for this corruption and return false if so, while converting
|
|
// all corrupt bytes to valid ones.
|
|
//
|
|
EASTDC_API bool ConvertASCIIArrayToBinaryData(const char* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData)
|
|
{
|
|
uint8_t* pBinaryData8 = (uint8_t*)pBinaryData;
|
|
const char* pEnd = pASCIIArray + nASCIIArrayLength;
|
|
char cTemp;
|
|
bool bReturnValue(true);
|
|
|
|
while(pASCIIArray < pEnd)
|
|
{
|
|
*pBinaryData8 = 0;
|
|
|
|
for(int j = 4; j >= 0; j -= 4)
|
|
{
|
|
cTemp = *pASCIIArray;
|
|
|
|
if(cTemp < '0') // Do some bounds checking.
|
|
{
|
|
cTemp = '0';
|
|
bReturnValue = false;
|
|
}
|
|
else if(cTemp > 'F') // Do some bounds checking.
|
|
{
|
|
if(cTemp >= 'a' && cTemp <= 'f')
|
|
cTemp -= 39; // Convert 'a' to ':'.
|
|
else
|
|
{
|
|
cTemp = '0';
|
|
bReturnValue = false;
|
|
}
|
|
}
|
|
else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking.
|
|
{
|
|
cTemp = '0';
|
|
bReturnValue = false;
|
|
}
|
|
else if(cTemp >= 'A')
|
|
cTemp -= 7;
|
|
|
|
*pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j));
|
|
pASCIIArray++;
|
|
}
|
|
|
|
pBinaryData8++;
|
|
}
|
|
|
|
return bReturnValue;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// ConvertASCIIArrayToBinaryData (16 bit version)
|
|
//
|
|
// We have a boolean return value because it is possible that the ascii data is
|
|
// corrupt. We check for this corruption and return false if so, while converting
|
|
// all corrupt bytes to valid ones.
|
|
//
|
|
EASTDC_API bool ConvertASCIIArrayToBinaryData(const char16_t* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData)
|
|
{
|
|
uint8_t* pBinaryData8 = (uint8_t*)pBinaryData;
|
|
const char16_t* pEnd = pASCIIArray + nASCIIArrayLength;
|
|
char16_t cTemp;
|
|
bool bReturnValue(true);
|
|
|
|
while(pASCIIArray < pEnd)
|
|
{
|
|
*pBinaryData8 = 0;
|
|
|
|
for(int j = 4; j >= 0; j -= 4)
|
|
{
|
|
cTemp = *pASCIIArray;
|
|
|
|
if(cTemp < '0') // Do some bounds checking.
|
|
{
|
|
cTemp = '0';
|
|
bReturnValue = false;
|
|
}
|
|
else if(cTemp > 'F') // Do some bounds checking.
|
|
{
|
|
if(cTemp >= 'a' && cTemp <= 'f')
|
|
cTemp -= 39; // Convert 'a' to ':'.
|
|
else
|
|
{
|
|
cTemp = '0';
|
|
bReturnValue = false;
|
|
}
|
|
}
|
|
else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking.
|
|
{
|
|
cTemp = '0';
|
|
bReturnValue = false;
|
|
}
|
|
else if(cTemp >= 'A')
|
|
cTemp -= 7;
|
|
|
|
*pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j));
|
|
pASCIIArray++;
|
|
}
|
|
|
|
pBinaryData8++;
|
|
}
|
|
|
|
return bReturnValue;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// ConvertASCIIArrayToBinaryData (32 bit version)
|
|
//
|
|
// We have a boolean return value because it is possible that the ascii data is
|
|
// corrupt. We check for this corruption and return false if so, while converting
|
|
// all corrupt bytes to valid ones.
|
|
//
|
|
EASTDC_API bool ConvertASCIIArrayToBinaryData(const char32_t* pASCIIArray, size_t nASCIIArrayLength, void* pBinaryData)
|
|
{
|
|
uint8_t* pBinaryData8 = (uint8_t*)pBinaryData;
|
|
const char32_t* pEnd = pASCIIArray + nASCIIArrayLength;
|
|
char32_t cTemp;
|
|
bool bReturnValue(true);
|
|
|
|
while(pASCIIArray < pEnd)
|
|
{
|
|
*pBinaryData8 = 0;
|
|
|
|
for(int j = 4; j >= 0; j -= 4)
|
|
{
|
|
cTemp = *pASCIIArray;
|
|
|
|
if(cTemp < '0') // Do some bounds checking.
|
|
{
|
|
cTemp = '0';
|
|
bReturnValue = false;
|
|
}
|
|
else if(cTemp > 'F') // Do some bounds checking.
|
|
{
|
|
if(cTemp >= 'a' && cTemp <= 'f')
|
|
cTemp -= 39; // Convert 'a' to ':'.
|
|
else
|
|
{
|
|
cTemp = '0';
|
|
bReturnValue = false;
|
|
}
|
|
}
|
|
else if(cTemp > '9' && cTemp < 'A') // Do some bounds checking.
|
|
{
|
|
cTemp = '0';
|
|
bReturnValue = false;
|
|
}
|
|
else if(cTemp >= 'A')
|
|
cTemp -= 7;
|
|
|
|
*pBinaryData8 = (uint8_t)(*pBinaryData8 + ((cTemp - '0') << j));
|
|
pASCIIArray++;
|
|
}
|
|
|
|
pBinaryData8++;
|
|
}
|
|
|
|
return bReturnValue;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// SplitTokenDelimited (8 bit version)
|
|
//
|
|
EASTDC_API bool SplitTokenDelimited(const char* pSource, size_t nSourceLength, char cDelimiter,
|
|
char* pToken, size_t nTokenLength, const char** ppNewSource)
|
|
{
|
|
// terminate the token (so it appears empty if we don't find anything)
|
|
if(pToken && nTokenLength)
|
|
*pToken = 0;
|
|
|
|
if(pSource && nSourceLength && *pSource)
|
|
{
|
|
// look for the delimiter
|
|
for(size_t i = 0; i < nSourceLength && *pSource; i++)
|
|
{
|
|
const char cTemp(*pSource);
|
|
|
|
// update new source pointer if present
|
|
if(ppNewSource)
|
|
(*ppNewSource)++;
|
|
|
|
if(cTemp == cDelimiter) // If there is a delimiter match...
|
|
break; // We are done.
|
|
else
|
|
{
|
|
// keep moving characters into the token until we find the delimiter or reached the end of the token string
|
|
if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null
|
|
{
|
|
*pToken = cTemp; // add the character
|
|
pToken++; // increment the token pointer
|
|
*pToken = 0; // insert terminating null character
|
|
}
|
|
|
|
pSource++; // increment source pointer
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// SplitTokenDelimited (16 bit version)
|
|
//
|
|
// Implemented by Blazej Stompel and Paul Pedriana
|
|
//
|
|
EASTDC_API bool SplitTokenDelimited(const char16_t* pSource, size_t nSourceLength, char16_t cDelimiter,
|
|
char16_t* pToken, size_t nTokenLength, const char16_t** ppNewSource)
|
|
{
|
|
// terminate the token (so it appears empty if we don't find anything)
|
|
if(pToken && nTokenLength)
|
|
*pToken = 0;
|
|
|
|
if(pSource && nSourceLength && *pSource)
|
|
{
|
|
// look for the delimiter
|
|
for(size_t i = 0; i < nSourceLength && *pSource; i++)
|
|
{
|
|
const char16_t cTemp(*pSource);
|
|
|
|
// update new source pointer if present
|
|
if(ppNewSource)
|
|
(*ppNewSource)++;
|
|
|
|
if(cTemp == cDelimiter) // If there is a delimiter match...
|
|
break; // We are done.
|
|
else
|
|
{
|
|
// keep moving characters into the token until we find the delimiter or reached the end of the token string
|
|
if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null
|
|
{
|
|
*pToken = cTemp; // add the character
|
|
pToken++; // increment the token pointer
|
|
*pToken = 0; // insert terminating null character
|
|
}
|
|
|
|
pSource++; // increment source pointer
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// SplitTokenDelimited (32 bit version)
|
|
//
|
|
// Implemented by Blazej Stompel and Paul Pedriana
|
|
//
|
|
EASTDC_API bool SplitTokenDelimited(const char32_t* pSource, size_t nSourceLength, char32_t cDelimiter,
|
|
char32_t* pToken, size_t nTokenLength, const char32_t** ppNewSource)
|
|
{
|
|
// terminate the token (so it appears empty if we don't find anything)
|
|
if(pToken && nTokenLength)
|
|
*pToken = 0;
|
|
|
|
if(pSource && nSourceLength && *pSource)
|
|
{
|
|
// look for the delimiter
|
|
for(size_t i = 0; i < nSourceLength && *pSource; i++)
|
|
{
|
|
const char32_t cTemp(*pSource);
|
|
|
|
// update new source pointer if present
|
|
if(ppNewSource)
|
|
(*ppNewSource)++;
|
|
|
|
if(cTemp == cDelimiter) // If there is a delimiter match...
|
|
break; // We are done.
|
|
else
|
|
{
|
|
// keep moving characters into the token until we find the delimiter or reached the end of the token string
|
|
if(pToken && ((i + 1) < nTokenLength)) // we need an extra character for terminating null
|
|
{
|
|
*pToken = cTemp; // add the character
|
|
pToken++; // increment the token pointer
|
|
*pToken = 0; // insert terminating null character
|
|
}
|
|
|
|
pSource++; // increment source pointer
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// SplitTokenSeparated (8 bit version)
|
|
//
|
|
EASTDC_API bool SplitTokenSeparated(const char* pSource, size_t nSourceLength, char c,
|
|
char* pToken, size_t nTokenLength, const char** ppNewSource)
|
|
{
|
|
// terminate the token (so it appears empty if we don't find anything)
|
|
|
|
if(pToken && nTokenLength)
|
|
*pToken = '\0';
|
|
|
|
if(pSource)
|
|
{
|
|
// keep track of how many characters we have written to the token buffer
|
|
size_t nTokenIndex = 0;
|
|
|
|
// keep track whether we found the token and if we are done reading it
|
|
bool bFoundToken = false;
|
|
bool bReadToken = false;
|
|
|
|
// look for the separators
|
|
for(size_t i = 0; i < nSourceLength; i++)
|
|
{
|
|
// get the character
|
|
const char cTemp(*pSource);
|
|
|
|
// quit if we found the terminating null character
|
|
if(cTemp != '\0')
|
|
{
|
|
// is the character not a separator ?
|
|
if(cTemp != c)
|
|
{
|
|
// we have a token
|
|
bFoundToken = true;
|
|
|
|
// were we done reading the token ?
|
|
if(bReadToken)
|
|
return true;
|
|
else
|
|
{
|
|
// add the character to the token
|
|
if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null
|
|
{
|
|
// add the character
|
|
*pToken = cTemp;
|
|
|
|
// increment the token pointer
|
|
pToken++;
|
|
|
|
// and index
|
|
nTokenIndex++;
|
|
|
|
// insert terminating null character
|
|
*pToken = '\0';
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// the character is a separator - if we found our token then we are done reading it
|
|
if(bFoundToken)
|
|
bReadToken = true;
|
|
}
|
|
|
|
// update new source pointer if present
|
|
if(ppNewSource)
|
|
(*ppNewSource)++;
|
|
|
|
// increment source pointer
|
|
pSource++;
|
|
}
|
|
else
|
|
{
|
|
// we have reached the end of the string
|
|
break;
|
|
}
|
|
}
|
|
|
|
return bFoundToken;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// SplitTokenSeparated (16 bit version)
|
|
//
|
|
// Implemented by Blazej Stompel
|
|
//
|
|
// Unit test can be found in Foundation\Test\UnitTests
|
|
//
|
|
EASTDC_API bool SplitTokenSeparated(const char16_t* pSource, size_t nSourceLength, char16_t c,
|
|
char16_t* pToken, size_t nTokenLength, const char16_t** ppNewSource)
|
|
{
|
|
// terminate the token (so it appears empty if we don't find anything)
|
|
if(pToken && nTokenLength)
|
|
*pToken = '\0';
|
|
|
|
if(pSource)
|
|
{
|
|
// keep track of how many characters we have written to the token buffer
|
|
size_t nTokenIndex = 0;
|
|
|
|
// keep track whether we found the token and if we are done reading it
|
|
bool bFoundToken = false;
|
|
bool bReadToken = false;
|
|
|
|
// look for the separators
|
|
for(size_t i = 0; i < nSourceLength; i++)
|
|
{
|
|
// get the character
|
|
const char16_t cTemp(*pSource);
|
|
|
|
// quit if we found the terminating null character
|
|
if(cTemp != '\0')
|
|
{
|
|
// is the character not a separator ?
|
|
if(cTemp != c)
|
|
{
|
|
// we have a token
|
|
bFoundToken = true;
|
|
|
|
// were we done reading the token ?
|
|
if(bReadToken)
|
|
return true;
|
|
else
|
|
{
|
|
// add the character to the token
|
|
if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null
|
|
{
|
|
// add the character
|
|
*pToken = cTemp;
|
|
|
|
// increment the token pointer
|
|
pToken++;
|
|
|
|
// and index
|
|
nTokenIndex++;
|
|
|
|
// insert terminating null character
|
|
*pToken = '\0';
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// the character is a separator - if we found our token then we are done reading it
|
|
if(bFoundToken)
|
|
bReadToken = true;
|
|
}
|
|
|
|
// update new source pointer if present
|
|
if(ppNewSource)
|
|
(*ppNewSource)++;
|
|
|
|
// increment source pointer
|
|
pSource++;
|
|
}
|
|
else
|
|
{
|
|
// we have reached the end of the string
|
|
break;
|
|
}
|
|
}
|
|
|
|
return bFoundToken;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// SplitTokenSeparated (32 bit version)
|
|
//
|
|
// Implemented by Blazej Stompel
|
|
//
|
|
// Unit test can be found in Foundation\Test\UnitTests
|
|
//
|
|
EASTDC_API bool SplitTokenSeparated(const char32_t* pSource, size_t nSourceLength, char32_t c,
|
|
char32_t* pToken, size_t nTokenLength, const char32_t** ppNewSource)
|
|
{
|
|
// terminate the token (so it appears empty if we don't find anything)
|
|
if(pToken && nTokenLength)
|
|
*pToken = '\0';
|
|
|
|
if(pSource)
|
|
{
|
|
// keep track of how many characters we have written to the token buffer
|
|
size_t nTokenIndex = 0;
|
|
|
|
// keep track whether we found the token and if we are done reading it
|
|
bool bFoundToken = false;
|
|
bool bReadToken = false;
|
|
|
|
// look for the separators
|
|
for(size_t i = 0; i < nSourceLength; i++)
|
|
{
|
|
// get the character
|
|
const char32_t cTemp(*pSource);
|
|
|
|
// quit if we found the terminating null character
|
|
if(cTemp != '\0')
|
|
{
|
|
// is the character not a separator ?
|
|
if(cTemp != c)
|
|
{
|
|
// we have a token
|
|
bFoundToken = true;
|
|
|
|
// were we done reading the token ?
|
|
if(bReadToken)
|
|
return true;
|
|
else
|
|
{
|
|
// add the character to the token
|
|
if(pToken && (nTokenIndex + 1) < nTokenLength) // we need an extra character for terminating null
|
|
{
|
|
// add the character
|
|
*pToken = cTemp;
|
|
|
|
// increment the token pointer
|
|
pToken++;
|
|
|
|
// and index
|
|
nTokenIndex++;
|
|
|
|
// insert terminating null character
|
|
*pToken = '\0';
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// the character is a separator - if we found our token then we are done reading it
|
|
if(bFoundToken)
|
|
bReadToken = true;
|
|
}
|
|
|
|
// update new source pointer if present
|
|
if(ppNewSource)
|
|
(*ppNewSource)++;
|
|
|
|
// increment source pointer
|
|
pSource++;
|
|
}
|
|
else
|
|
{
|
|
// we have reached the end of the string
|
|
break;
|
|
}
|
|
}
|
|
|
|
return bFoundToken;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// Boyer-Moore string search
|
|
//
|
|
// This is the "turbo" implementation defined at http://www-igm.univ-mlv.fr/~lecroq/string/node14.html#SECTION00140.
|
|
// Boyer-Moore is a very fast string search compared to most others, including
|
|
// those in the STL. However, you need to be searching a string of at least 100
|
|
// chars and have a search pattern of at least 3 characters for the speed to show,
|
|
// as Boyer-Moore has a startup precalculation that costs some cycles.
|
|
// This startup precalculation is proportional to the size of your search pattern
|
|
// and the size of the alphabet in use. Thus, doing Boyer-Moore searches on the
|
|
// entire Unicode alphabet is going to incur a fairly expensive precalculation cost.
|
|
//
|
|
|
|
// This is a private function used by BoyerMooreSearch.
|
|
//
|
|
static void BoyerMooreBadCharacterCalc(const char* pPattern, int nPatternLength,
|
|
int* pAlphabetBuffer, int nAlphabetBufferSize)
|
|
{
|
|
int i;
|
|
|
|
for(i = 0; i < nAlphabetBufferSize; ++i)
|
|
pAlphabetBuffer[i] = nPatternLength;
|
|
|
|
for(i = 0; i < (nPatternLength - 1); ++i)
|
|
pAlphabetBuffer[(int)pPattern[i]] = (nPatternLength - i) - 1;
|
|
}
|
|
|
|
|
|
// This is a private function used by BoyerMooreSearch.
|
|
//
|
|
static void BoyerMooreGoodSuffixCalc(const char* pPattern, int nPatternLength,
|
|
int* pPatternBuffer1, int* pPatternBuffer2)
|
|
{
|
|
int i;
|
|
int j = 0;
|
|
int f = 0;
|
|
int g = nPatternLength - 1;
|
|
|
|
pPatternBuffer2[nPatternLength - 1] = nPatternLength;
|
|
|
|
for(i = nPatternLength - 2; i >= 0; --i)
|
|
{
|
|
if((i > g) && pPatternBuffer2[((i + nPatternLength) - 1) - f] < (i - g))
|
|
pPatternBuffer2[i] = pPatternBuffer2[((i + nPatternLength) - 1) - f];
|
|
else
|
|
{
|
|
if(i < g)
|
|
g = i;
|
|
|
|
f = i;
|
|
|
|
while((g >= 0) && (pPattern[g] == pPattern[((g + nPatternLength) - 1) - f]))
|
|
--g;
|
|
|
|
pPatternBuffer2[i] = f - g;
|
|
}
|
|
}
|
|
|
|
for(i = 0; i < nPatternLength; ++i)
|
|
pPatternBuffer1[i] = nPatternLength;
|
|
|
|
for(i = nPatternLength - 1; i >= -1; --i)
|
|
{
|
|
if((i == -1) || (pPatternBuffer2[i] == (i + 1)))
|
|
{
|
|
for(; j < (nPatternLength - 1) - i; ++j)
|
|
{
|
|
if(pPatternBuffer1[j] == nPatternLength)
|
|
pPatternBuffer1[j] = (nPatternLength - 1) - i;
|
|
}
|
|
}
|
|
}
|
|
|
|
for(i = 0; i <= nPatternLength - 2; ++i)
|
|
pPatternBuffer1[(nPatternLength - 1) - pPatternBuffer2[i]] = (nPatternLength - 1) - i;
|
|
}
|
|
|
|
|
|
|
|
// Argument specification.
|
|
//
|
|
// patternBuffer1 is a user-supplied buffer and must be at least as long as the search pattern.
|
|
// patternBuffer2 is a user-supplied buffer and must be at least as long as the search pattern.
|
|
// alphabetBuffer is a user-supplied buffer and must be at least as long as the highest character value used in the searched string and search pattern.
|
|
//
|
|
EASTDC_API int BoyerMooreSearch(const char* pPattern, int nPatternLength, const char* pSearchString, int nSearchStringLength,
|
|
int* pPatternBuffer1, int* pPatternBuffer2, int* pAlphabetBuffer, int nAlphabetBufferSize)
|
|
{
|
|
// Do precalculations
|
|
BoyerMooreGoodSuffixCalc(pPattern, nPatternLength, pPatternBuffer1, pPatternBuffer2);
|
|
BoyerMooreBadCharacterCalc(pPattern, nPatternLength, pAlphabetBuffer, nAlphabetBufferSize);
|
|
|
|
// Do search
|
|
for(int j = 0, shift = nPatternLength, u = 0; j <= (nSearchStringLength - nPatternLength); j += shift)
|
|
{
|
|
int i = nPatternLength - 1;
|
|
|
|
while((i >= 0) && (pPattern[i] == pSearchString[i + j]))
|
|
{
|
|
--i;
|
|
|
|
if((u != 0) && (i == (nPatternLength - 1) - shift))
|
|
i -= u;
|
|
}
|
|
|
|
if(i < 0)
|
|
{
|
|
return j;
|
|
|
|
// Only used if we were iterating multiple found items:
|
|
//shift = pPatternBuffer1[0];
|
|
//u = nPatternLength - shift;
|
|
}
|
|
else
|
|
{
|
|
const int v = nPatternLength - 1 - i;
|
|
const int turboShift = u - v;
|
|
const int bcShift = pAlphabetBuffer[(int)pSearchString[i + j]] - nPatternLength + 1 + i;
|
|
shift = EATEXTUTIL_MAX(turboShift, bcShift);
|
|
shift = EATEXTUTIL_MAX(shift, pPatternBuffer1[i]);
|
|
|
|
if(shift == pPatternBuffer1[i])
|
|
u = EATEXTUTIL_MIN(nPatternLength - shift, v);
|
|
else
|
|
{
|
|
if(turboShift < bcShift)
|
|
shift = EATEXTUTIL_MAX(shift, u + 1);
|
|
u = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
return nPatternLength;
|
|
}
|
|
|
|
|
|
#undef EATEXTUTIL_MIN
|
|
#undef EATEXTUTIL_MAX
|
|
|
|
|
|
} // namespace StdC
|
|
} // namespace EA
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|