Skip to content
Snippets Groups Projects
Commit af5faf1a authored by Luigi Scarso's avatar Luigi Scarso
Browse files

Accept 0xFFFD but still error on invalid UTF-8 (compatible) (H.Hagen)

[[Split portion of a mixed commit.]]
parent 4c3e74e0
No related branches found
No related tags found
No related merge requests found
2022-08-16 Luigi Scarso <luigi.scarso@gmail.com>
* Accept 0xFFFD but still error on invalid UTF-8 (compatible) (H.Hagen)
2022-08-05 Luigi Scarso <luigi.scarso@gmail.com>
* Fixed a missed lua_settop in lua_appendtovlist_callback() (thanks to Hironori KITAGAWA)
......
#ifndef luatex_svn_revision_h
#define luatex_svn_revision_h
/* Defined exactly once: redefining a macro with a different value is a constraint violation. */
#define luatex_svn_revision 7537
#endif
......@@ -32,6 +32,7 @@ static void utf_error(void)
deletions_allowed = true;
}
/*
unsigned str2uni(const unsigned char *k)
{
register int ch;
......@@ -40,7 +41,7 @@ unsigned str2uni(const unsigned char *k)
if ((ch = *text++) < 0x80) {
val = (unsigned) ch;
} else if (ch <= 0xbf) {
/*tex An error that we skip. */
//
} else if (ch <= 0xdf) {
if (*text >= 0x80 && *text < 0xc0)
val = (unsigned) (((ch & 0x1f) << 6) | (*text++ & 0x3f));
......@@ -58,18 +59,51 @@ unsigned str2uni(const unsigned char *k)
if (*text < 0x80 || text[1] < 0x80 || text[2] < 0x80 ||
*text >= 0xc0 || text[1] >= 0xc0 || text[2] >= 0xc0)
val = 0xFFFD;
} else {
/*tex
The 5- and 6-byte UTF-8 sequences generate integers that are outside
of the valid UCS range, and therefore unsupported.
*/
}
if (val == 0xFFFD)
utf_error();
return (val);
}
*/
/*
As of August 13, 2022 we do the following: an invalid UTF-8 sequence still raises an error,
because we have to remain compatible, but 0xFFFD is now accepted as a valid code point.
*/
/*tex

    Decode the single UTF-8 sequence that starts at |k| and return its code
    point. Sequences of one to four bytes are handled; a lead byte in the
    range 0x80..0xbf (a stray continuation byte) or above 0xf7 is an error,
    as is any sequence whose trailing bytes are not all in 0x80..0xbf.

    On an error |utf_error| is called and 0xFFFD is returned. A correctly
    encoded 0xFFFD decodes to 0xFFFD without an error being raised.

    Every trailing byte is validated before it is used and before the next
    one is looked at, so a sequence that is cut short by the terminating zero
    byte of a string is never read past that terminator. No check is made for
    overlong forms, surrogates, or values above 0x10FFFF.

*/

unsigned str2uni(const unsigned char *k)
{
    const unsigned char *text = k + 1;
    const int ch = *k;
    int val = -1;
    if (ch < 0x80) {
        val = ch;
    } else if (ch <= 0xbf) {
        /*tex A continuation byte cannot start a sequence: an error. */
    } else if (ch <= 0xdf) {
        if (text[0] >= 0x80 && text[0] < 0xc0) {
            val = ((ch & 0x1f) << 6) | (text[0] & 0x3f);
        }
    } else if (ch <= 0xef) {
        if (text[0] >= 0x80 && text[0] < 0xc0 &&
            text[1] >= 0x80 && text[1] < 0xc0) {
            val = ((ch & 0xf) << 12) | ((text[0] & 0x3f) << 6) | (text[1] & 0x3f);
        }
    } else if (ch <= 0xf7) {
        /*tex
            Validate first, then combine the 3 + 6 + 6 + 6 payload bits
            directly. This avoids both reading unchecked bytes and shifting
            an intermediate value that can be negative.
        */
        if (text[0] >= 0x80 && text[0] < 0xc0 &&
            text[1] >= 0x80 && text[1] < 0xc0 &&
            text[2] >= 0x80 && text[2] < 0xc0) {
            val = ((ch & 0x7) << 18) | ((text[0] & 0x3f) << 12) |
                  ((text[1] & 0x3f) << 6) | (text[2] & 0x3f);
        }
    }
    if (val < 0) {
        utf_error();
        return 0xFFFD;
    }
    return (unsigned) val;
}
/*tex
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment