--- ie_imp_UTF8.cpp.orig	Wed Feb 7 08:55:08 2001
+++ ie_imp_UTF8.cpp	Tue Apr 10 23:08:02 2001
@@ -308,8 +308,69 @@
 bool IE_Imp_UTF8::RecognizeContents(const char * szBuf,
 									UT_uint32 iNumbytes)
 {
-	// TODO: Not yet written
-	return(false);
+	// Heuristic sniffer: returns true when at least one valid continuation
+	// byte was seen and no malformed sequence was found in the first
+	// iNumbytes bytes of szBuf.  Plain 7-bit ASCII alone returns false.
+	bool bSuccess = false;
+	const unsigned char *p = reinterpret_cast<const unsigned char *>(szBuf);
+	const unsigned char *pEnd = p + iNumbytes;
+
+	while (p < pEnd)
+	{
+		int len;
+
+		if ((*p & 0x80) == 0)  // ASCII
+		{
+			++p;
+			continue;
+		}
+		else if ((*p & 0xc0) == 0x80)  // not UTF-8
+		{
+			return false;
+		}
+		else if (*p == 0xfe || *p == 0xff)  // BOM markers? RFC2279 says illegal
+		{
+			UT_DEBUGMSG((" BOM?\n"));
+			++p;
+			continue;
+		}
+		else if ((*p & 0xfe) == 0xfc)  // lead byte in 6-byte sequence
+			len = 6;
+		else if ((*p & 0xfc) == 0xf8)  // lead byte in 5-byte sequence
+			len = 5;
+		else if ((*p & 0xf8) == 0xf0)  // lead byte in 4-byte sequence
+			len = 4;
+		else if ((*p & 0xf0) == 0xe0)  // lead byte in 3-byte sequence
+			len = 3;
+		else if ((*p & 0xe0) == 0xc0)  // lead byte in 2-byte sequence
+			len = 2;
+		else
+		{
+			// the above code covers all cases - if we reach here the logic is wrong
+			UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
+			return false;
+		}
+
+		// Each of the len-1 bytes after the lead byte must look like 10xxxxxx.
+		while (--len)
+		{
+			++p;
+			if (p >= pEnd)
+			{
+				// Buffer ended inside a sequence: answer from what was seen so
+				// far, returning here so p is never stepped beyond pEnd.
+				UT_DEBUGMSG((" out of data!\n"));
+				return bSuccess;
+			}
+			if ((*p & 0xc0) == 0x80)
+				bSuccess = true;
+			else
+				return false;
+		}
+		++p;
+	}
+
+	return bSuccess;
 }
 
 bool IE_Imp_UTF8::RecognizeSuffix(const char * szSuffix)