00001 #include <stdio.h>
00002 #include <string.h>
00003
/*
 * NULL-terminated table of UTF-8 test inputs.
 *
 * Each entry is a human-readable label followed by the raw bytes under
 * test, written as hex escapes.  The labels carry section numbers
 * (2.x boundary cases, 3.x malformed sequences, 4.x overlong encodings,
 * 5.x surrogates / U+FFFE / U+FFFF).
 * NOTE(review): the numbering looks like Markus Kuhn's "UTF-8 decoder
 * capability and stress test" -- confirm the provenance.
 */
char *testStrings[] = {
    /* Well-formed sample text. */
    "You should see the Greek word kosme: \xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5 |\n",

    /*
     * 2.1.x -- lowest code point listed for each sequence length.
     * The first entry embeds a NUL byte, so strlen() on it stops right
     * after the label.
     */
    "2.1.1 1 byte (U-00000000): \x00 \n",
    "2.1.2 2 bytes (U-00000080): \xc2\x80 |\n",
    "2.1.3 3 bytes (U-00000800): \xe0\xa0\x80 |\n",
    "2.1.4 4 bytes (U-00010000): \xf0\x90\x80\x80 |\n",
    "2.1.5 5 bytes (U-00200000): \xf8\x88\x80\x80\x80 |\n",
    "2.1.6 6 bytes (U-04000000): \xfc\x84\x80\x80\x80\x80 |\n",

    /* 2.2.x -- highest code point listed for each sequence length. */
    "2.2.1 1 byte (U-0000007F): \x7f \n",
    "2.2.2 2 bytes (U-000007FF): \xdf\xbf |\n",
    "2.2.3 3 bytes (U-0000FFFF): \xef\xbf\xbf |\n",
    "2.2.4 4 bytes (U-001FFFFF): \xf7\xbf\xbf\xbf |\n",
    "2.2.5 5 bytes (U-03FFFFFF): \xfb\xbf\xbf\xbf\xbf |\n",
    "2.2.6 6 bytes (U-7FFFFFFF): \xfd\xbf\xbf\xbf\xbf\xbf |\n",

    /* 2.3.x -- other boundary code points. */
    "2.3.1 U-0000D7FF = ed 9f bf = \xed\x9f\xbf |\n",
    "2.3.2 U-0000E000 = ee 80 80 = \xee\x80\x80 |\n",
    "2.3.3 U-0000FFFD = ef bf bd = \xef\xbf\xbd |\n",
    "2.3.4 U-0010FFFF = f4 8f bf bf = \xf4\x8f\xbf\xbf |\n",
    "2.3.5 U-00110000 = f4 90 80 80 = \xf4\x90\x80\x80 |\n",

    /* Section heading only: pure ASCII. */
    "3 Malformed sequences |\n",

    /* 3.1.x -- continuation bytes with no lead byte in front of them. */
    "3.1.1 First continuation byte 0x80: \x80 |\n",
    "3.1.2 Last continuation byte 0xbf: \xbf |\n",
    "3.1.3 2 continuation bytes: \x80\xbf |\n",
    "3.1.4 3 continuation bytes: \x80\xbf\x80 |\n",
    "3.1.5 4 continuation bytes: \x80\xbf\x80\xbf |\n",
    "3.1.6 5 continuation bytes: \x80\xbf\x80\xbf\x80 |\n",
    "3.1.7 6 continuation bytes: \x80\xbf\x80\xbf\x80\xbf |\n",
    "3.1.8 7 continuation bytes: \x80\xbf\x80\xbf\x80\xbf\x80 |\n",

    /* Unlabelled: every byte 0x80..0xBF in order, sixteen per entry. */
    " \x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f |\n",
    " \x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f |\n",
    " \xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf |\n",
    " \xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf |\n",

    /* Unlabelled: each byte 0xC0..0xFD on its own, followed by a space. */
    " \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf |\n",
    " \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf |\n",
    " \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef |\n",
    " \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 |\n",
    " \xf8 \xf9 \xfa \xfb |\n",
    " \xfc \xfd |\n",

    /* 3.3.x -- multi-byte sequences with the final byte missing. */
    "3.3.1 2-byte sequence with last byte missing (U+0000): \xc0 |\n",
    "3.3.2 3-byte sequence with last byte missing (U+0000): \xe0\x80 |\n",
    "3.3.3 4-byte sequence with last byte missing (U+0000): \xf0\x80\x80 |\n",
    "3.3.4 5-byte sequence with last byte missing (U+0000): \xf8\x80\x80\x80 |\n",
    "3.3.5 6-byte sequence with last byte missing (U+0000): \xfc\x80\x80\x80\x80 |\n",
    "3.3.6 2-byte sequence with last byte missing (U-000007FF): \xdf |\n",
    "3.3.7 3-byte sequence with last byte missing (U-0000FFFF): \xef\xbf |\n",
    "3.3.8 4-byte sequence with last byte missing (U-001FFFFF): \xf7\xbf\xbf |\n",
    "3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): \xfb\xbf\xbf\xbf |\n",
    "3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): \xfd\xbf\xbf\xbf\xbf |\n",

    /* Unlabelled: the ten truncated sequences above, back to back. */
    " \xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf |\n",

    /* 3.5.x -- the bytes 0xFE and 0xFF. */
    "3.5.1 fe = \xfe |\n",
    "3.5.2 ff = \xff |\n",
    "3.5.3 fe fe ff ff = \xfe\xfe\xff\xff |\n",

    /* 4.1.x -- U+002F ('/') encoded with more bytes than necessary. */
    "4.1.1 U+002F = c0 af = \xc0\xaf |\n",
    "4.1.2 U+002F = e0 80 af = \xe0\x80\xaf |\n",
    "4.1.3 U+002F = f0 80 80 af = \xf0\x80\x80\xaf |\n",
    "4.1.4 U+002F = f8 80 80 80 af = \xf8\x80\x80\x80\xaf |\n",
    "4.1.5 U+002F = fc 80 80 80 80 af = \xfc\x80\x80\x80\x80\xaf |\n",

    /* 4.2.x -- largest code point of each length, encoded one byte too long. */
    "4.2.1 U-0000007F = c1 bf = \xc1\xbf |\n",
    "4.2.2 U-000007FF = e0 9f bf = \xe0\x9f\xbf |\n",
    "4.2.3 U-0000FFFF = f0 8f bf bf = \xf0\x8f\xbf\xbf |\n",
    "4.2.4 U-001FFFFF = f8 87 bf bf bf = \xf8\x87\xbf\xbf\xbf |\n",
    "4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = \xfc\x83\xbf\xbf\xbf\xbf |\n",

    /* 4.3.x -- U+0000 encoded with more bytes than necessary. */
    "4.3.1 U+0000 = c0 80 = \xc0\x80 |\n",
    "4.3.2 U+0000 = e0 80 80 = \xe0\x80\x80 |\n",
    "4.3.3 U+0000 = f0 80 80 80 = \xf0\x80\x80\x80 |\n",
    "4.3.4 U+0000 = f8 80 80 80 80 = \xf8\x80\x80\x80\x80 |\n",
    "4.3.5 U+0000 = fc 80 80 80 80 80 = \xfc\x80\x80\x80\x80\x80 |\n",

    /* 5.1.x -- single code points in the range U+D800..U+DFFF. */
    "5.1.1 U+D800 = ed a0 80 = \xed\xa0\x80 |\n",
    "5.1.2 U+DB7F = ed ad bf = \xed\xad\xbf |\n",
    "5.1.3 U+DB80 = ed ae 80 = \xed\xae\x80 |\n",
    "5.1.4 U+DBFF = ed af bf = \xed\xaf\xbf |\n",
    "5.1.5 U+DC00 = ed b0 80 = \xed\xb0\x80 |\n",
    "5.1.6 U+DF80 = ed be 80 = \xed\xbe\x80 |\n",
    "5.1.7 U+DFFF = ed bf bf = \xed\xbf\xbf |\n",

    /* 5.2.x -- pairs of code points from that same range. */
    "5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = \xed\xa0\x80\xed\xb0\x80 |\n",
    "5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = \xed\xa0\x80\xed\xbf\xbf |\n",
    "5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = \xed\xad\xbf\xed\xb0\x80 |\n",
    "5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = \xed\xad\xbf\xed\xbf\xbf |\n",
    "5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = \xed\xae\x80\xed\xb0\x80 |\n",
    "5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = \xed\xae\x80\xed\xbf\xbf |\n",
    "5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = \xed\xaf\xbf\xed\xb0\x80 |\n",
    "5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = \xed\xaf\xbf\xed\xbf\xbf |\n",

    /* 5.3.x -- U+FFFE and U+FFFF. */
    "5.3.1 U+FFFE = ef bf be = \xef\xbf\xbe |\n",
    "5.3.2 U+FFFF = ef bf bf = \xef\xbf\xbf |\n",

    /* End-of-table sentinel. */
    NULL,
};
00090
/**
 * Check whether a byte buffer is a sequence of well-formed UTF-8 style
 * multi-byte characters.
 *
 * The checker accepts the extended 1- to 6-byte forms (lead bytes up to
 * 0xFD) and rejects overlong encodings by requiring a minimum value for
 * the first continuation byte after the lead bytes 0xE0, 0xF0, 0xF8 and
 * 0xFC.  Lead bytes 0xC0/0xC1, stray continuation bytes (0x80..0xBF) and
 * the bytes 0xFE/0xFF are rejected.  It does not reject the encodings of
 * U+D800..U+DFFF, nor code points above U+10FFFF.
 *
 * @param input   Bytes to examine; only the first @p length bytes are read.
 * @param length  Number of bytes in @p input.
 *
 * @return  1 if every sequence in the buffer is well formed (an empty
 *          buffer counts as well formed);
 *          0 if an invalid byte or sequence is found;
 *          a value <= 0 if the buffer ends in the middle of a multi-byte
 *          sequence: -1 for a 2-byte sequence, otherwise minus the number
 *          of bytes that follow the lead byte.
 *          NOTE(review): for 3- to 6-byte sequences whose lead byte is the
 *          very last byte this yields 0, which is indistinguishable from
 *          "invalid" -- confirm whether callers rely on that.
 */
int utf8_test_valid(const unsigned char *input, const unsigned int length)
{
    unsigned int current = 0;

    while (current < length) {
        const unsigned char lead = input[current];
        /* Bytes available after the lead byte. */
        const unsigned int remaining = length - (current + 1);
        /* Continuation bytes this lead byte requires. */
        unsigned int trailing;
        /* Smallest first continuation byte that is not an overlong form. */
        unsigned char second_min = 0x80;
        unsigned int k;

        if (lead <= 0x7F) {
            /* Plain ASCII: a complete character on its own. */
            current++;
            continue;
        }

        /*
         * 0x80..0xBF are continuation bytes with no lead byte, 0xC0/0xC1
         * can only start overlong 2-byte forms, 0xFE/0xFF are never valid.
         */
        if (lead <= 0xC1 || lead >= 0xFE) {
            return 0;
        }

        if (lead <= 0xDF) {
            trailing = 1;
        } else if (lead <= 0xEF) {
            trailing = 2;
            if (lead == 0xE0) {
                second_min = 0xA0;
            }
        } else if (lead <= 0xF7) {
            trailing = 3;
            if (lead == 0xF0) {
                second_min = 0x90;
            }
        } else if (lead <= 0xFB) {
            trailing = 4;
            if (lead == 0xF8) {
                second_min = 0x88;
            }
        } else {
            /* 0xFC or 0xFD */
            trailing = 5;
            if (lead == 0xFC) {
                second_min = 0x84;
            }
        }

        /*
         * Truncated sequence.  This guard must run for every lead byte,
         * including 0xFD, before any continuation byte is read; otherwise
         * the checks below would read past the end of the buffer.
         */
        if (remaining < trailing) {
            return (trailing == 1) ? -1 : -(int)remaining;
        }

        /* First continuation byte: also enforces the overlong limit. */
        if (input[current + 1] < second_min || input[current + 1] > 0xBF) {
            return 0;
        }

        /* Every further continuation byte must be in 0x80..0xBF. */
        for (k = 2; k <= trailing; k++) {
            if (input[current + k] < 0x80 || input[current + k] > 0xBF) {
                return 0;
            }
        }

        current += trailing + 1;
    }

    return 1;
}
00185
00186 main()
00187 {
00188 int i, length;
00189 for(i = 0; i < sizeof(testStrings) && testStrings[i] != NULL; i++) {
00190 printf(" %d = %d\n", i,
00191 utf8_test_valid(testStrings[i], strlen(testStrings[i])));
00192 }
00193 }