src/utf8.c

Go to the documentation of this file.
00001 #include <stdio.h>
00002 #include <string.h>
00003 
/*
 * testStrings - NULL-terminated table of byte strings that main() feeds to
 * utf8_test_valid(), one call per entry.
 *
 * Each entry is a C string literal made of a human-readable label, the raw
 * bytes under test written as \x escapes, and padding.  The first entry is
 * a well-formed sample (the Greek word "kosme"); entries labelled 2.x give
 * first/last encodings for each sequence length; the entry labelled
 * "3  Malformed sequences" starts the malformed groups.
 * NOTE(review): the section numbering and layout look like they were taken
 * from Markus Kuhn's "UTF-8 decoder capability and stress test" -- confirm
 * against that document before relying on the labels.
 *
 * Caveat: entry 2.1.1 embeds a literal NUL ("\x00").  main() measures each
 * entry with strlen(), so for that entry only the bytes before the NUL are
 * passed to the validator.
 *
 * The final NULL element is the sentinel main() stops on.
 */
00004 char *testStrings[] = {
00005         "You should see the Greek word kosme:       \xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5                          |\n",
00006         "2.1.1  1 byte  (U-00000000):        \x00                                        \n",
00007         "2.1.2  2 bytes (U-00000080):        \xc2\x80                                       |\n",
00008         "2.1.3  3 bytes (U-00000800):        \xe0\xa0\x80                                       |\n",
00009         "2.1.4  4 bytes (U-00010000):        \xf0\x90\x80\x80                                       |\n",
00010         "2.1.5  5 bytes (U-00200000):        \xf8\x88\x80\x80\x80                                       |\n",
00011         "2.1.6  6 bytes (U-04000000):        \xfc\x84\x80\x80\x80\x80                                       |\n",
00012         "2.2.1  1 byte  (U-0000007F):        \x7f                                        \n",
00013         "2.2.2  2 bytes (U-000007FF):        \xdf\xbf                                       |\n",
00014         "2.2.3  3 bytes (U-0000FFFF):        \xef\xbf\xbf                                       |\n",
00015         "2.2.4  4 bytes (U-001FFFFF):        \xf7\xbf\xbf\xbf                                       |\n",
00016         "2.2.5  5 bytes (U-03FFFFFF):        \xfb\xbf\xbf\xbf\xbf                                       |\n",
00017         "2.2.6  6 bytes (U-7FFFFFFF):        \xfd\xbf\xbf\xbf\xbf\xbf                                       |\n",
00018         "2.3.1  U-0000D7FF = ed 9f bf = \xed\x9f\xbf                                            |\n",
00019         "2.3.2  U-0000E000 = ee 80 80 = \xee\x80\x80                                            |\n",
00020         "2.3.3  U-0000FFFD = ef bf bd = \xef\xbf\xbd                                            |\n",
00021         "2.3.4  U-0010FFFF = f4 8f bf bf = \xf4\x8f\xbf\xbf                                         |\n",
00022         "2.3.5  U-00110000 = f4 90 80 80 = \xf4\x90\x80\x80                                         |\n",
00023         "3  Malformed sequences                                                        |\n",
00024         "3.1.1  First continuation byte 0x80: \x80                                      |\n",
00025         "3.1.2  Last  continuation byte 0xbf: \xbf                                      |\n",
00026         "3.1.3  2 continuation bytes: \x80\xbf                                             |\n",
00027         "3.1.4  3 continuation bytes: \x80\xbf\x80                                            |\n",
00028         "3.1.5  4 continuation bytes: \x80\xbf\x80\xbf                                           |\n",
00029         "3.1.6  5 continuation bytes: \x80\xbf\x80\xbf\x80                                          |\n",
00030         "3.1.7  6 continuation bytes: \x80\xbf\x80\xbf\x80\xbf                                         |\n",
00031         "3.1.8  7 continuation bytes: \x80\xbf\x80\xbf\x80\xbf\x80                                        |\n",
00032         "   \x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f                                                          |\n",
00033         "    \x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f                                                          |\n",
00034         "    \xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf                                                          |\n",
00035         "    \xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf                                                         |\n",
00036         "   \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf                                           |\n",
00037         "    \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf                                          |\n",
00038         "   \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef                                          |\n",
00039         "   \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7                                                          |\n",
00040         "   \xf8 \xf9 \xfa \xfb                                                                  |\n",
00041         "   \xfc \xfd                                                                      |\n",
00042         "3.3.1  2-byte sequence with last byte missing (U+0000):     \xc0               |\n",
00043         "3.3.2  3-byte sequence with last byte missing (U+0000):     \xe0\x80               |\n",
00044         "3.3.3  4-byte sequence with last byte missing (U+0000):     \xf0\x80\x80               |\n",
00045         "3.3.4  5-byte sequence with last byte missing (U+0000):     \xf8\x80\x80\x80               |\n",
00046         "3.3.5  6-byte sequence with last byte missing (U+0000):     \xfc\x80\x80\x80\x80               |\n",
00047         "3.3.6  2-byte sequence with last byte missing (U-000007FF): \xdf               |\n",
00048         "3.3.7  3-byte sequence with last byte missing (U-0000FFFF): \xef\xbf               |\n",
00049         "3.3.8  4-byte sequence with last byte missing (U-001FFFFF): \xf7\xbf\xbf               |\n",
00050         "3.3.9  5-byte sequence with last byte missing (U-03FFFFFF): \xfb\xbf\xbf\xbf               |\n",
00051         "3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): \xfd\xbf\xbf\xbf\xbf               |\n",
00052         "   \xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf                                                               |\n",
00053         "3.5.1  fe = \xfe                                                               |\n",
00054         "3.5.2  ff = \xff                                                               |\n",
00055         "3.5.3  fe fe ff ff = \xfe\xfe\xff\xff                                                   |\n",
00056         "4.1.1 U+002F = c0 af             = \xc0\xaf                                        |\n",
00057         "4.1.2 U+002F = e0 80 af          = \xe0\x80\xaf                                        |\n",
00058         "4.1.3 U+002F = f0 80 80 af       = \xf0\x80\x80\xaf                                        |\n",
00059         "4.1.4 U+002F = f8 80 80 80 af    = \xf8\x80\x80\x80\xaf                                        |\n",
00060         "4.1.5 U+002F = fc 80 80 80 80 af = \xfc\x80\x80\x80\x80\xaf                                        |\n",
00061         "4.2.1  U-0000007F = c1 bf             = \xc1\xbf                                   |\n",
00062         "4.2.2  U-000007FF = e0 9f bf          = \xe0\x9f\xbf                                   |\n",
00063         "4.2.3  U-0000FFFF = f0 8f bf bf       = \xf0\x8f\xbf\xbf                                   |\n",
00064         "4.2.4  U-001FFFFF = f8 87 bf bf bf    = \xf8\x87\xbf\xbf\xbf                                   |\n",
00065         "4.2.5  U-03FFFFFF = fc 83 bf bf bf bf = \xfc\x83\xbf\xbf\xbf\xbf                                   |\n",
00066         "4.3.1  U+0000 = c0 80             = \xc0\x80                                       |\n",
00067         "4.3.2  U+0000 = e0 80 80          = \xe0\x80\x80                                       |\n",
00068         "4.3.3  U+0000 = f0 80 80 80       = \xf0\x80\x80\x80                                       |\n",
00069         "4.3.4  U+0000 = f8 80 80 80 80    = \xf8\x80\x80\x80\x80                                       |\n",
00070         "4.3.5  U+0000 = fc 80 80 80 80 80 = \xfc\x80\x80\x80\x80\x80                                       |\n",
00071         "5.1.1  U+D800 = ed a0 80 = \xed\xa0\x80                                                |\n",
00072         "5.1.2  U+DB7F = ed ad bf = \xed\xad\xbf                                                |\n",
00073         "5.1.3  U+DB80 = ed ae 80 = \xed\xae\x80                                                |\n",
00074         "5.1.4  U+DBFF = ed af bf = \xed\xaf\xbf                                                |\n",
00075         "5.1.5  U+DC00 = ed b0 80 = \xed\xb0\x80                                                |\n",
00076         "5.1.6  U+DF80 = ed be 80 = \xed\xbe\x80                                                |\n",
00077         "5.1.7  U+DFFF = ed bf bf = \xed\xbf\xbf                                                |\n",
00078         "5.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80 = \xed\xa0\x80\xed\xb0\x80                               |\n",
00079         "5.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf = \xed\xa0\x80\xed\xbf\xbf                               |\n",
00080         "5.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80 = \xed\xad\xbf\xed\xb0\x80                               |\n",
00081         "5.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf = \xed\xad\xbf\xed\xbf\xbf                               |\n",
00082         "5.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80 = \xed\xae\x80\xed\xb0\x80                               |\n",
00083         "5.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf = \xed\xae\x80\xed\xbf\xbf                               |\n",
00084         "5.2.7  U+DBFF U+DC00 = ed af bf ed b0 80 = \xed\xaf\xbf\xed\xb0\x80                               |\n",
00085         "5.2.8  U+DBFF U+DFFF = ed af bf ed bf bf = \xed\xaf\xbf\xed\xbf\xbf                               |\n",
00086         "5.3.1  U+FFFE = ef bf be = \xef\xbf\xbe                                                |\n",
00087         "5.3.2  U+FFFF = ef bf bf = \xef\xbf\xbf                                                |\n",
00088         NULL,
00089 };
00090 
/*
 * utf8_test_valid - check that a byte buffer is structurally valid UTF-8.
 *
 * input:  bytes to examine; only input[0 .. length-1] is ever read.
 * length: number of bytes in input.
 *
 * Returns
 *    1  every byte belongs to a complete, acceptable sequence (an empty
 *       buffer is valid);
 *    0  an invalid byte was found: a stray continuation byte (0x80-0xBF),
 *       an overlong lead (0xC0, 0xC1), 0xFE/0xFF, an out-of-range second
 *       byte, or a bad continuation byte;
 *   <0  the buffer ends inside a multi-byte sequence.  A truncated 2-byte
 *       sequence yields -1.  Longer forms yield -(bytes available after
 *       the lead byte); note that this is 0 -- indistinguishable from
 *       "invalid" -- when nothing at all follows the lead byte.  That
 *       historical quirk is kept so existing callers see the same values.
 *
 * The original 1- to 6-byte UTF-8 scheme is accepted (lead bytes up to
 * 0xFD).  Overlong encodings are rejected through the minimum allowed
 * value of the second byte.  Surrogate code points (0xED 0xA0..0xBF ..)
 * and values above U+10FFFF are NOT rejected.
 */
int utf8_test_valid(const unsigned char *input, const unsigned int length)
{
        unsigned int pos;

        for (pos = 0; pos < length; pos++) {
                const unsigned char lead = input[pos];
                /* Bytes that follow the lead byte inside the buffer. */
                const unsigned int remaining = length - (pos + 1);
                /* Smallest second byte that is not an overlong encoding. */
                unsigned char second_min = 0x80;
                unsigned int trailing;
                unsigned int k;

                if (lead <= 0x7F)
                        continue;               /* plain ASCII */

                /*
                 * 0x80-0xBF: continuation byte with no lead byte.
                 * 0xC0-0xC1: could only start an overlong 2-byte form.
                 * 0xFE-0xFF: never part of UTF-8.
                 */
                if (lead <= 0xC1 || lead >= 0xFE)
                        return 0;

                if (lead <= 0xDF) {
                        trailing = 1;
                } else if (lead <= 0xEF) {
                        trailing = 2;
                        if (lead == 0xE0)
                                second_min = 0xA0;
                } else if (lead <= 0xF7) {
                        trailing = 3;
                        if (lead == 0xF0)
                                second_min = 0x90;
                } else if (lead <= 0xFB) {
                        trailing = 4;
                        if (lead == 0xF8)
                                second_min = 0x88;
                } else {
                        /* 0xFC or 0xFD */
                        trailing = 5;
                        if (lead == 0xFC)
                                second_min = 0x84;
                }

                /*
                 * Truncation is checked for every lead byte before any
                 * trailing byte is touched.  Previously 0xFD skipped this
                 * check and read past the end of the buffer.
                 */
                if (remaining < trailing)
                        return trailing == 1 ? -1 : -(int)remaining;

                for (k = 1; k <= trailing; k++) {
                        const unsigned char byte = input[pos + k];
                        const unsigned char floor = (k == 1) ? second_min : 0x80;

                        if (byte < floor || byte > 0xBF)
                                return 0;
                }

                /* Skip the trailing bytes; the loop increment skips the lead. */
                pos += trailing;
        }
        return 1;
}
00185 
00186 main()
00187 {
00188         int i, length;
00189         for(i = 0; i < sizeof(testStrings) && testStrings[i] != NULL; i++) {
00190                 printf(" %d = %d\n", i,
00191                            utf8_test_valid(testStrings[i], strlen(testStrings[i])));
00192         }
00193 }

Generated on Mon May 28 04:25:26 2007 for BattletechMUX by  doxygen 1.4.7