src/libcroco/cr-utils.c

   1 /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
   2
   3 /*
   4  * This file is part of The Croco Library
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of version 2.1 of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  18  * USA
  19  *
  20  * Author: Dodji Seketeli
  21  * See COPYRIGHTS file for copyright information.
  22  */
  23
  24 #include "cr-utils.h"
  25 #include "cr-string.h"
  26
  27 /**
  28  *@file:
  29  *Some misc utility functions used
  30  *in the libcroco.
  31  *Note that throughout this file I will
  32  *refer to the CSS SPECIFICATIONS DOCUMENTATION
  33  *written by the w3c guys. You can find that document
  34  *at http://www.w3.org/TR/REC-CSS2/ .
  35  */
  36
  37 /****************************
  38  *Encoding transformations and
  39  *encoding helpers
  40  ****************************/
  41
  42 /*
  43  *Here is the correspondence between the ucs-4 character codes
  44  *and their matching utf-8 encoding pattern as described by RFC 2279:
  45  *
  46  *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
  47  *------------------    -----------------------------
  48  *0000 0000-0000 007F   0xxxxxxx
  49  *0000 0080-0000 07FF   110xxxxx 10xxxxxx
  50  *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
  51  *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  52  *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  53  *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
  54  */
  55
  56 /**
  57  *Given an utf8 string buffer, calculates
  58  *the length of this string if it was encoded
  59  *in ucs4.
  60  *@param a_in_start a pointer to the begining of
  61  *the input utf8 string.
  62  *@param a_in_end a pointre to the end of the input
  63  *utf8 string (points to the last byte of the buffer)
  64  *@param a_len out parameter the calculated length.
  65  *@return CR_OK upon succesfull completion, an error code
  66  *otherwise.
  67  */
  68 enum CRStatus
  69 cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
  70                                const guchar * a_in_end, gulong * a_len)
  71 {
  72         guchar *byte_ptr = NULL;
  73         gint len = 0;
  74
  75         /*
  76          *to store the final decoded
  77          *unicode char
  78          */
  79         guint c = 0;
  80
  81         g_return_val_if_fail (a_in_start && a_in_end && a_len,
  82                               CR_BAD_PARAM_ERROR);
  83         *a_len = 0;
  84
  85         for (byte_ptr = (guchar *) a_in_start;
  86              byte_ptr <= a_in_end; byte_ptr++) {
  87                 gint nb_bytes_2_decode = 0;
  88
  89                 if (*byte_ptr <= 0x7F) {
  90                         /*
  91                          *7 bits long char
  92                          *encoded over 1 byte:
  93                          * 0xxx xxxx
  94                          */
  95                         c = *byte_ptr;
  96                         nb_bytes_2_decode = 1;
  97
  98                 } else if ((*byte_ptr & 0xE0) == 0xC0) {
  99                         /*
 100                          *up to 11 bits long char.
 101                          *encoded over 2 bytes:
 102                          *110x xxxx  10xx xxxx
 103                          */
 104                         c = *byte_ptr & 0x1F;
 105                         nb_bytes_2_decode = 2;
 106
 107                 } else if ((*byte_ptr & 0xF0) == 0xE0) {
 108                         /*
 109                          *up to 16 bit long char
 110                          *encoded over 3 bytes:
 111                          *1110 xxxx  10xx xxxx  10xx xxxx
 112                          */
 113                         c = *byte_ptr & 0x0F;
 114                         nb_bytes_2_decode = 3;
 115
 116                 } else if ((*byte_ptr & 0xF8) == 0xF0) {
 117                         /*
 118                          *up to 21 bits long char
 119                          *encoded over 4 bytes:
 120                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
 121                          */
 122                         c = *byte_ptr & 0x7;
 123                         nb_bytes_2_decode = 4;
 124
 125                 } else if ((*byte_ptr & 0xFC) == 0xF8) {
 126                         /*
 127                          *up to 26 bits long char
 128                          *encoded over 5 bytes.
 129                          *1111 10xx  10xx xxxx  10xx xxxx
 130                          *10xx xxxx  10xx xxxx
 131                          */
 132                         c = *byte_ptr & 3;
 133                         nb_bytes_2_decode = 5;
 134
 135                 } else if ((*byte_ptr & 0xFE) == 0xFC) {
 136                         /*
 137                          *up to 31 bits long char
 138                          *encoded over 6 bytes:
 139                          *1111 110x  10xx xxxx  10xx xxxx
 140                          *10xx xxxx  10xx xxxx  10xx xxxx
 141                          */
 142                         c = *byte_ptr & 1;
 143                         nb_bytes_2_decode = 6;
 144
 145                 } else {
 146                         /*
 147                          *BAD ENCODING
 148                          */
 149                         return CR_ENCODING_ERROR;
 150                 }
 151
 152                 /*
 153                  *Go and decode the remaining byte(s)
 154                  *(if any) to get the current character.
 155                  */
 156                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
 157                         /*decode the next byte */
 158                         byte_ptr++;
 159
 160                         /*byte pattern must be: 10xx xxxx */
 161                         if ((*byte_ptr & 0xC0) != 0x80) {
 162                                 return CR_ENCODING_ERROR;
 163                         }
 164
 165                         c = (c << 6) | (*byte_ptr & 0x3F);
 166                 }
 167
 168                 len++;
 169         }
 170
 171         *a_len = len;
 172
 173         return CR_OK;
 174 }
 175
 176 /**
 177  *Given an ucs4 string, this function
 178  *returns the size (in bytes) this string
 179  *would have occupied if it was encoded in utf-8.
 180  *@param a_in_start a pointer to the beginning of the input
 181  *buffer.
 182  *@param a_in_end a pointer to the end of the input buffer.
 183  *@param a_len out parameter. The computed length.
 184  *@return CR_OK upon successfull completion, an error code otherwise.
 185  */
 186 enum CRStatus
 187 cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
 188                                const guint32 * a_in_end, gulong * a_len)
 189 {
 190         gint len = 0;
 191         guint32 *char_ptr = NULL;
 192
 193         g_return_val_if_fail (a_in_start && a_in_end && a_len,
 194                               CR_BAD_PARAM_ERROR);
 195
 196         for (char_ptr = (guint32 *) a_in_start;
 197              char_ptr <= a_in_end; char_ptr++) {
 198                 if (*char_ptr <= 0x7F) {
 199                         /*the utf-8 char would take 1 byte */
 200                         len += 1;
 201                 } else if (*char_ptr <= 0x7FF) {
 202                         /*the utf-8 char would take 2 bytes */
 203                         len += 2;
 204                 } else if (*char_ptr <= 0xFFFF) {
 205                         len += 3;
 206                 } else if (*char_ptr <= 0x1FFFFF) {
 207                         len += 4;
 208                 } else if (*char_ptr <= 0x3FFFFFF) {
 209                         len += 5;
 210                 } else if (*char_ptr <= 0x7FFFFFFF) {
 211                         len += 6;
 212                 }
 213         }
 214
 215         *a_len = len;
 216         return CR_OK;
 217 }
 218
 219 /**
 220  *Given an ucsA string, this function
 221  *returns the size (in bytes) this string
 222  *would have occupied if it was encoded in utf-8.
 223  *@param a_in_start a pointer to the beginning of the input
 224  *buffer.
 225  *@param a_in_end a pointer to the end of the input buffer.
 226  *@param a_len out parameter. The computed length.
 227  *@return CR_OK upon successfull completion, an error code otherwise.
 228  */
 229 enum CRStatus
 230 cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
 231                                const guchar * a_in_end, gulong * a_len)
 232 {
 233         gint len = 0;
 234         guchar *char_ptr = NULL;
 235
 236         g_return_val_if_fail (a_in_start && a_in_end && a_len,
 237                               CR_BAD_PARAM_ERROR);
 238
 239         for (char_ptr = (guchar *) a_in_start;
 240              char_ptr <= a_in_end; char_ptr++) {
 241                 if (*char_ptr <= 0x7F) {
 242                         /*the utf-8 char would take 1 byte */
 243                         len += 1;
 244                 } else {
 245                         /*the utf-8 char would take 2 bytes */
 246                         len += 2;
 247                 }
 248         }
 249
 250         *a_len = len;
 251         return CR_OK;
 252 }
 253
 254 /**
 255  *Converts an utf8 buffer into an ucs4 buffer.
 256  *
 257  *@param a_in the input utf8 buffer to convert.
 258  *@param a_in_len in/out parameter. The size of the
 259  *input buffer to convert. After return, this parameter contains
 260  *the actual number of bytes consumed.
 261  *@param a_out the output converted ucs4 buffer. Must be allocated by
 262  *the caller.
 263  *@param a_out_len in/out parameter. The size of the output buffer.
 264  *If this size is actually smaller than the real needed size, the function
 265  *just converts what it can and returns a success status. After return,
 266  *this param points to the actual number of characters decoded.
 267  *@return CR_OK upon successfull completion, an error code otherwise.
 268  */
 269 enum CRStatus
 270 cr_utils_utf8_to_ucs4 (const guchar * a_in,
 271                        gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
 272 {
 273         gulong in_len = 0,
 274                 out_len = 0,
 275                 in_index = 0,
 276                 out_index = 0;
 277         enum CRStatus status = CR_OK;
 278
 279         /*
 280          *to store the final decoded
 281          *unicode char
 282          */
 283         guint c = 0;
 284
 285         g_return_val_if_fail (a_in && a_in_len
 286                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
 287
 288         if (*a_in_len < 1) {
 289                 status = CR_OK;
 290                 goto end;
 291         }
 292
 293         in_len = *a_in_len;
 294         out_len = *a_out_len;
 295
 296         for (in_index = 0, out_index = 0;
 297              (in_index < in_len) && (out_index < out_len);
 298              in_index++, out_index++) {
 299                 gint nb_bytes_2_decode = 0;
 300
 301                 if (a_in[in_index] <= 0x7F) {
 302                         /*
 303                          *7 bits long char
 304                          *encoded over 1 byte:
 305                          * 0xxx xxxx
 306                          */
 307                         c = a_in[in_index];
 308                         nb_bytes_2_decode = 1;
 309
 310                 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
 311                         /*
 312                          *up to 11 bits long char.
 313                          *encoded over 2 bytes:
 314                          *110x xxxx  10xx xxxx
 315                          */
 316                         c = a_in[in_index] & 0x1F;
 317                         nb_bytes_2_decode = 2;
 318
 319                 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
 320                         /*
 321                          *up to 16 bit long char
 322                          *encoded over 3 bytes:
 323                          *1110 xxxx  10xx xxxx  10xx xxxx
 324                          */
 325                         c = a_in[in_index] & 0x0F;
 326                         nb_bytes_2_decode = 3;
 327
 328                 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
 329                         /*
 330                          *up to 21 bits long char
 331                          *encoded over 4 bytes:
 332                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
 333                          */
 334                         c = a_in[in_index] & 0x7;
 335                         nb_bytes_2_decode = 4;
 336
 337                 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
 338                         /*
 339                          *up to 26 bits long char
 340                          *encoded over 5 bytes.
 341                          *1111 10xx  10xx xxxx  10xx xxxx
 342                          *10xx xxxx  10xx xxxx
 343                          */
 344                         c = a_in[in_index] & 3;
 345                         nb_bytes_2_decode = 5;
 346
 347                 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
 348                         /*
 349                          *up to 31 bits long char
 350                          *encoded over 6 bytes:
 351                          *1111 110x  10xx xxxx  10xx xxxx
 352                          *10xx xxxx  10xx xxxx  10xx xxxx
 353                          */
 354                         c = a_in[in_index] & 1;
 355                         nb_bytes_2_decode = 6;
 356
 357                 } else {
 358                         /*BAD ENCODING */
 359                         goto end;
 360                 }
 361
 362                 /*
 363                  *Go and decode the remaining byte(s)
 364                  *(if any) to get the current character.
 365                  */
 366                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
 367                         /*decode the next byte */
 368                         in_index++;
 369
 370                         /*byte pattern must be: 10xx xxxx */
 371                         if ((a_in[in_index] & 0xC0) != 0x80) {
 372                                 goto end;
 373                         }
 374
 375                         c = (c << 6) | (a_in[in_index] & 0x3F);
 376                 }
 377
 378                 /*
 379                  *The decoded ucs4 char is now
 380                  *in c.
 381                  */
 382
 383                 /************************
 384                  *Some security tests
 385                  ***********************/
 386
 387                 /*be sure c is a char */
 388                 if (c == 0xFFFF || c == 0xFFFE)
 389                         goto end;
 390
 391                 /*be sure c is inferior to the max ucs4 char value */
 392                 if (c > 0x10FFFF)
 393                         goto end;
 394
 395                 /*
 396                  *c must be less than UTF16 "lower surrogate begin"
 397                  *or higher than UTF16 "High surrogate end"
 398                  */
 399                 if (c >= 0xD800 && c <= 0xDFFF)
 400                         goto end;
 401
 402                 /*Avoid characters that equals zero */
 403                 if (c == 0)
 404                         goto end;
 405
 406                 a_out[out_index] = c;
 407         }
 408
 409       end:
 410         *a_out_len = out_index + 1;
 411         *a_in_len = in_index + 1;
 412
 413         return status;
 414 }
 415
 416 /**
 417  *Reads a character from an utf8 buffer.
 418  *Actually decode the next character code (unicode character code)
 419  *and returns it.
 420  *@param a_in the starting address of the utf8 buffer.
 421  *@param a_in_len the length of the utf8 buffer.
 422  *@param a_out output parameter. The resulting read char.
 423  *@param a_consumed the number of the bytes consumed to
 424  *decode the returned character code.
 425  *@return CR_OK upon successfull completion, an error code otherwise.
 426  */
 427 enum CRStatus
 428 cr_utils_read_char_from_utf8_buf (const guchar * a_in,
 429                                   gulong a_in_len,
 430                                   guint32 * a_out, gulong * a_consumed)
 431 {
 432         gulong in_len = 0,
 433                 in_index = 0,
 434                 nb_bytes_2_decode = 0;
 435         enum CRStatus status = CR_OK;
 436
 437         /*
 438          *to store the final decoded
 439          *unicode char
 440          */
 441         guint32 c = 0;
 442
 443         g_return_val_if_fail (a_in && a_out && a_out
 444                               && a_consumed, CR_BAD_PARAM_ERROR);
 445
 446         if (a_in_len < 1) {
 447                 status = CR_OK;
 448                 goto end;
 449         }
 450
 451         in_len = a_in_len;
 452
 453         if (*a_in <= 0x7F) {
 454                 /*
 455                  *7 bits long char
 456                  *encoded over 1 byte:
 457                  * 0xxx xxxx
 458                  */
 459                 c = *a_in;
 460                 nb_bytes_2_decode = 1;
 461
 462         } else if ((*a_in & 0xE0) == 0xC0) {
 463                 /*
 464                  *up to 11 bits long char.
 465                  *encoded over 2 bytes:
 466                  *110x xxxx  10xx xxxx
 467                  */
 468                 c = *a_in & 0x1F;
 469                 nb_bytes_2_decode = 2;
 470
 471         } else if ((*a_in & 0xF0) == 0xE0) {
 472                 /*
 473                  *up to 16 bit long char
 474                  *encoded over 3 bytes:
 475                  *1110 xxxx  10xx xxxx  10xx xxxx
 476                  */
 477                 c = *a_in & 0x0F;
 478                 nb_bytes_2_decode = 3;
 479
 480         } else if ((*a_in & 0xF8) == 0xF0) {
 481                 /*
 482                  *up to 21 bits long char
 483                  *encoded over 4 bytes:
 484                  *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
 485                  */
 486                 c = *a_in & 0x7;
 487                 nb_bytes_2_decode = 4;
 488
 489         } else if ((*a_in & 0xFC) == 0xF8) {
 490                 /*
 491                  *up to 26 bits long char
 492                  *encoded over 5 bytes.
 493                  *1111 10xx  10xx xxxx  10xx xxxx
 494                  *10xx xxxx  10xx xxxx
 495                  */
 496                 c = *a_in & 3;
 497                 nb_bytes_2_decode = 5;
 498
 499         } else if ((*a_in & 0xFE) == 0xFC) {
 500                 /*
 501                  *up to 31 bits long char
 502                  *encoded over 6 bytes:
 503                  *1111 110x  10xx xxxx  10xx xxxx
 504                  *10xx xxxx  10xx xxxx  10xx xxxx
 505                  */
 506                 c = *a_in & 1;
 507                 nb_bytes_2_decode = 6;
 508
 509         } else {
 510                 /*BAD ENCODING */
 511                 goto end;
 512         }
 513
 514         if (nb_bytes_2_decode > a_in_len) {
 515                 status = CR_END_OF_INPUT_ERROR;
 516                 goto end;
 517         }
 518
 519         /*
 520          *Go and decode the remaining byte(s)
 521          *(if any) to get the current character.
 522          */
 523         for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
 524                 /*byte pattern must be: 10xx xxxx */
 525                 if ((a_in[in_index] & 0xC0) != 0x80) {
 526                         goto end;
 527                 }
 528
 529                 c = (c << 6) | (a_in[in_index] & 0x3F);
 530         }
 531
 532         /*
 533          *The decoded ucs4 char is now
 534          *in c.
 535          */
 536
 537     /************************
 538      *Some security tests
 539      ***********************/
 540
 541         /*be sure c is a char */
 542         if (c == 0xFFFF || c == 0xFFFE)
 543                 goto end;
 544
 545         /*be sure c is inferior to the max ucs4 char value */
 546         if (c > 0x10FFFF)
 547                 goto end;
 548
 549         /*
 550          *c must be less than UTF16 "lower surrogate begin"
 551          *or higher than UTF16 "High surrogate end"
 552          */
 553         if (c >= 0xD800 && c <= 0xDFFF)
 554                 goto end;
 555
 556         /*Avoid characters that equals zero */
 557         if (c == 0)
 558                 goto end;
 559
 560         *a_out = c;
 561
 562       end:
 563         *a_consumed = nb_bytes_2_decode;
 564
 565         return status;
 566 }
 567
 568 /**
 569  *
 570  */
 571 enum CRStatus
 572 cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
 573                                const guchar * a_in_end, gulong * a_len)
 574 {
 575         /*
 576          *Note: this function can be made shorter
 577          *but it considers all the cases of the utf8 encoding
 578          *to ease further extensions ...
 579          */
 580
 581         guchar *byte_ptr = NULL;
 582         gint len = 0;
 583
 584         /*
 585          *to store the final decoded
 586          *unicode char
 587          */
 588         guint c = 0;
 589
 590         g_return_val_if_fail (a_in_start && a_in_end && a_len,
 591                               CR_BAD_PARAM_ERROR);
 592         *a_len = 0;
 593
 594         for (byte_ptr = (guchar *) a_in_start;
 595              byte_ptr <= a_in_end; byte_ptr++) {
 596                 gint nb_bytes_2_decode = 0;
 597
 598                 if (*byte_ptr <= 0x7F) {
 599                         /*
 600                          *7 bits long char
 601                          *encoded over 1 byte:
 602                          * 0xxx xxxx
 603                          */
 604                         c = *byte_ptr;
 605                         nb_bytes_2_decode = 1;
 606
 607                 } else if ((*byte_ptr & 0xE0) == 0xC0) {
 608                         /*
 609                          *up to 11 bits long char.
 610                          *encoded over 2 bytes:
 611                          *110x xxxx  10xx xxxx
 612                          */
 613                         c = *byte_ptr & 0x1F;
 614                         nb_bytes_2_decode = 2;
 615
 616                 } else if ((*byte_ptr & 0xF0) == 0xE0) {
 617                         /*
 618                          *up to 16 bit long char
 619                          *encoded over 3 bytes:
 620                          *1110 xxxx  10xx xxxx  10xx xxxx
 621                          */
 622                         c = *byte_ptr & 0x0F;
 623                         nb_bytes_2_decode = 3;
 624
 625                 } else if ((*byte_ptr & 0xF8) == 0xF0) {
 626                         /*
 627                          *up to 21 bits long char
 628                          *encoded over 4 bytes:
 629                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
 630                          */
 631                         c = *byte_ptr & 0x7;
 632                         nb_bytes_2_decode = 4;
 633
 634                 } else if ((*byte_ptr & 0xFC) == 0xF8) {
 635                         /*
 636                          *up to 26 bits long char
 637                          *encoded over 5 bytes.
 638                          *1111 10xx  10xx xxxx  10xx xxxx
 639                          *10xx xxxx  10xx xxxx
 640                          */
 641                         c = *byte_ptr & 3;
 642                         nb_bytes_2_decode = 5;
 643
 644                 } else if ((*byte_ptr & 0xFE) == 0xFC) {
 645                         /*
 646                          *up to 31 bits long char
 647                          *encoded over 6 bytes:
 648                          *1111 110x  10xx xxxx  10xx xxxx
 649                          *10xx xxxx  10xx xxxx  10xx xxxx
 650                          */
 651                         c = *byte_ptr & 1;
 652                         nb_bytes_2_decode = 6;
 653
 654                 } else {
 655                         /*
 656                          *BAD ENCODING
 657                          */
 658                         return CR_ENCODING_ERROR;
 659                 }
 660
 661                 /*
 662                  *Go and decode the remaining byte(s)
 663                  *(if any) to get the current character.
 664                  */
 665                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
 666                         /*decode the next byte */
 667                         byte_ptr++;
 668
 669                         /*byte pattern must be: 10xx xxxx */
 670                         if ((*byte_ptr & 0xC0) != 0x80) {
 671                                 return CR_ENCODING_ERROR;
 672                         }
 673
 674                         c = (c << 6) | (*byte_ptr & 0x3F);
 675                 }
 676
 677                 /*
 678                  *The decoded ucs4 char is now
 679                  *in c.
 680                  */
 681
 682                 if (c <= 0xFF) { /*Add other conditions to support
 683                                   *other char sets (ucs2, ucs3, ucs4).
 684                                   */
 685                         len++;
 686                 } else {
 687                         /*the char is too long to fit
 688                          *into the supposed charset len.
 689                          */
 690                         return CR_ENCODING_ERROR;
 691                 }
 692         }
 693
 694         *a_len = len;
 695
 696         return CR_OK;
 697 }
 698
 699 /**
 700  *Converts an utf8 string into an ucs4 string.
 701  *@param a_in the input string to convert.
 702  *@param a_in_len in/out parameter. The length of the input
 703  *string. After return, points to the actual number of bytes
 704  *consumed. This can be usefull to debug the input stream in case
 705  *of encoding error.
 706  *@param a_out out parameter. Points to the output string. It is allocated
 707  *by this function and must be freed by the caller.
 708  *@param a_out_len out parameter. The length of the output string.
 709  *@return CR_OK upon successfull completion, an error code otherwise.
 710  *
 711  */
 712 enum CRStatus
 713 cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
 714                            gulong * a_in_len,
 715                            guint32 ** a_out, gulong * a_out_len)
 716 {
 717         enum CRStatus status = CR_OK;
 718
 719         g_return_val_if_fail (a_in && a_in_len
 720                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
 721
 722         status = cr_utils_utf8_str_len_as_ucs4 (a_in,
 723                                                 &a_in[*a_in_len - 1],
 724                                                 a_out_len);
 725
 726         g_return_val_if_fail (status == CR_OK, status);
 727
 728         *a_out = (guint32 *) g_malloc0 (*a_out_len * sizeof (guint32));
 729
 730         status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
 731
 732         return status;
 733 }
 734
 735 /**
 736  *Converts an ucs4 buffer into an utf8 buffer.
 737  *
 738  *@param a_in the input ucs4 buffer to convert.
 739  *@param a_in_len in/out parameter. The size of the
 740  *input buffer to convert. After return, this parameter contains
 741  *the actual number of characters consumed.
 742  *@param a_out the output converted utf8 buffer. Must be allocated by
 743  *the caller.
 744  *@param a_out_len in/out parameter. The size of the output buffer.
 745  *If this size is actually smaller than the real needed size, the function
 746  *just converts what it can and returns a success status. After return,
 747  *this param points to the actual number of bytes in the buffer.
 748  *@return CR_OK upon successfull completion, an error code otherwise.
 749  */
 750 enum CRStatus
 751 cr_utils_ucs4_to_utf8 (const guint32 * a_in,
 752                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
 753 {
 754         gulong in_len = 0,
 755                 in_index = 0,
 756                 out_index = 0;
 757         enum CRStatus status = CR_OK;
 758
 759         g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
 760                               CR_BAD_PARAM_ERROR);
 761
 762         if (*a_in_len < 1) {
 763                 status = CR_OK;
 764                 goto end;
 765         }
 766
 767         in_len = *a_in_len;
 768
 769         for (in_index = 0; in_index < in_len; in_index++) {
 770                 /*
 771                  *FIXME: return whenever we encounter forbidden char values.
 772                  */
 773
 774                 if (a_in[in_index] <= 0x7F) {
 775                         a_out[out_index] = a_in[in_index];
 776                         out_index++;
 777                 } else if (a_in[in_index] <= 0x7FF) {
 778                         a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
 779                         a_out[out_index + 1] =
 780                                 (0x80 | (a_in[in_index] & 0x3F));
 781                         out_index += 2;
 782                 } else if (a_in[in_index] <= 0xFFFF) {
 783                         a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
 784                         a_out[out_index + 1] =
 785                                 (0x80 | ((a_in[in_index] >> 6) & 0x3F));
 786                         a_out[out_index + 2] =
 787                                 (0x80 | (a_in[in_index] & 0x3F));
 788                         out_index += 3;
 789                 } else if (a_in[in_index] <= 0x1FFFFF) {
 790                         a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
 791                         a_out[out_index + 1]
 792                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
 793                         a_out[out_index + 2]
 794                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
 795                         a_out[out_index + 3]
 796                                 = (0x80 | (a_in[in_index] & 0x3F));
 797                         out_index += 4;
 798                 } else if (a_in[in_index] <= 0x3FFFFFF) {
 799                         a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
 800                         a_out[out_index + 1] =
 801                                 (0x80 | (a_in[in_index] >> 18));
 802                         a_out[out_index + 2]
 803                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
 804                         a_out[out_index + 3]
 805                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
 806                         a_out[out_index + 4]
 807                                 = (0x80 | (a_in[in_index] & 0x3F));
 808                         out_index += 5;
 809                 } else if (a_in[in_index] <= 0x7FFFFFFF) {
 810                         a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
 811                         a_out[out_index + 1] =
 812                                 (0x80 | (a_in[in_index] >> 24));
 813                         a_out[out_index + 2]
 814                                 = (0x80 | ((a_in[in_index] >> 18) & 0x3F));
 815                         a_out[out_index + 3]
 816                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
 817                         a_out[out_index + 4]
 818                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
 819                         a_out[out_index + 4]
 820                                 = (0x80 | (a_in[in_index] & 0x3F));
 821                         out_index += 6;
 822                 } else {
 823                         status = CR_ENCODING_ERROR;
 824                         goto end;
 825                 }
 826         }                       /*end for */
 827
 828       end:
 829         *a_in_len = in_index + 1;
 830         *a_out_len = out_index + 1;
 831
 832         return status;
 833 }
 834
 835 /**
 836  *Converts an ucs4 string into an utf8 string.
 837  *@param a_in the input string to convert.
 838  *@param a_in_len in/out parameter. The length of the input
 839  *string. After return, points to the actual number of characters
 840  *consumed. This can be usefull to debug the input string in case
 841  *of encoding error.
 842  *@param a_out out parameter. Points to the output string. It is allocated
 843  *by this function and must be freed by the caller.
 844  *@param a_out_len out parameter. The length (in bytes) of the output string.
 845  *@return CR_OK upon successfull completion, an error code otherwise.
 846  */
 847 enum CRStatus
 848 cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
 849                            gulong * a_in_len,
 850                            guchar ** a_out, gulong * a_out_len)
 851 {
 852         enum CRStatus status = CR_OK;
 853
 854         g_return_val_if_fail (a_in && a_in_len && a_out
 855                               && a_out_len, CR_BAD_PARAM_ERROR);
 856
 857         status = cr_utils_ucs4_str_len_as_utf8 (a_in,
 858                                                 &a_in[*a_out_len - 1],
 859                                                 a_out_len);
 860
 861         g_return_val_if_fail (status == CR_OK, status);
 862
 863         status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
 864
 865         return status;
 866 }
 867
 868 /**
 869  *Converts an ucs1 buffer into an utf8 buffer.
 870  *The caller must know the size of the resulting buffer and
 871  *allocate it prior to calling this function.
 872  *
 873  *@param a_in the input ucs1 buffer.
 874  *
 875  *@param a_in_len in/out parameter. The length of the input buffer.
 876  *After return, points to the number of bytes actually consumed even
 877  *in case of encoding error.
 878  *
 879  *@param a_out out parameter. The output utf8 converted buffer.
 880  *
 881  *@param a_out_len in/out parameter. The size of the output buffer.
 882  *If the output buffer size is shorter than the actual needed size,
 883  *this function just convert what it can.
 884  *
 885  *@return CR_OK upon successfull completion, an error code otherwise.
 886  *
 887  */
 888 enum CRStatus
 889 cr_utils_ucs1_to_utf8 (const guchar * a_in,
 890                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
 891 {
 892         gulong out_index = 0,
 893                 in_index = 0,
 894                 in_len = 0,
 895                 out_len = 0;
 896         enum CRStatus status = CR_OK;
 897
 898         g_return_val_if_fail (a_in && a_in_len
 899                               && a_out_len,
 900                               CR_BAD_PARAM_ERROR);
 901
 902         if (*a_in_len == 0) {
 903                 *a_out_len = 0 ;
 904                 return CR_OK ;
 905         }
 906         g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;
 907
 908         if (*a_in_len < 1) {
 909                 status = CR_OK;
 910                 goto end;
 911         }
 912
 913         in_len = *a_in_len;
 914         out_len = *a_out_len;
 915
 916         for (in_index = 0, out_index = 0;
 917              (in_index < in_len) && (out_index < out_len); in_index++) {
 918                 /*
 919                  *FIXME: return whenever we encounter forbidden char values.
 920                  */
 921
 922                 if (a_in[in_index] <= 0x7F) {
 923                         a_out[out_index] = a_in[in_index];
 924                         out_index++;
 925                 } else {
 926                         a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
 927                         a_out[out_index + 1] =
 928                                 (0x80 | (a_in[in_index] & 0x3F));
 929                         out_index += 2;
 930                 }
 931         }                       /*end for */
 932
 933       end:
 934         *a_in_len = in_index;
 935         *a_out_len = out_index;
 936
 937         return CR_OK;
 938 }
 939
 940 /**
 941  *Converts an ucs1 string into an utf8 string.
 942  *@param a_in_start the beginning of the input string to convert.
 943  *@param a_in_end the end of the input string to convert.
 944  *@param a_out out parameter. The converted string.
 945  *@param a_out out parameter. The length of the converted string.
 946  *@return CR_OK upon successfull completion, an error code otherwise.
 947  *
 948  */
 949 enum CRStatus
 950 cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
 951                            gulong * a_in_len,
 952                            guchar ** a_out, gulong * a_out_len)
 953 {
 954         gulong in_len = 0,
 955                 out_len = 0;
 956         enum CRStatus status = CR_OK;
 957
 958         g_return_val_if_fail (a_in && a_in_len && a_out
 959                               && a_out_len, CR_BAD_PARAM_ERROR);
 960
 961         if (*a_in_len < 1) {
 962                 *a_out_len = 0;
 963                 *a_out = NULL;
 964                 return CR_OK;
 965         }
 966
 967         status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
 968                                                 &out_len);
 969
 970         g_return_val_if_fail (status == CR_OK, status);
 971
 972         in_len = *a_in_len;
 973
 974         *a_out = (guchar *) g_malloc0 (out_len);
 975
 976         status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);
 977
 978         *a_out_len = out_len;
 979
 980         return status;
 981 }
 982
 983 /**
 984  *Converts an utf8 buffer into an ucs1 buffer.
 985  *The caller must know the size of the resulting
 986  *converted buffer, and allocated it prior to calling this
 987  *function.
 988  *
 989  *@param a_in the input utf8 buffer to convert.
 990  *
 991  *@param a_in_len in/out parameter. The size of the input utf8 buffer.
 992  *After return, points to the number of bytes consumed
 993  *by the function even in case of encoding error.
 994  *
 995  *@param a_out out parameter. Points to the resulting buffer.
 996  *Must be allocated by the caller. If the size of a_out is shorter
 997  *than its required size, this function converts what it can and return
 998  *a successfull status.
 999  *
1000  *@param a_out_len in/out parameter. The size of the output buffer.
1001  *After return, points to the number of bytes consumed even in case of
1002  *encoding error.
1003  *
1004  *@return CR_OK upon successfull completion, an error code otherwise.
1005  */
1006 enum CRStatus
1007 cr_utils_utf8_to_ucs1 (const guchar * a_in,
1008                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
1009 {
1010         gulong in_index = 0,
1011                 out_index = 0,
1012                 in_len = 0,
1013                 out_len = 0;
1014         enum CRStatus status = CR_OK;
1015
1016         /*
1017          *to store the final decoded
1018          *unicode char
1019          */
1020         guint32 c = 0;
1021
1022         g_return_val_if_fail (a_in && a_in_len
1023                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1024
1025         if (*a_in_len < 1) {
1026                 status = CR_OK;
1027                 goto end;
1028         }
1029
1030         in_len = *a_in_len;
1031         out_len = *a_out_len;
1032
1033         for (in_index = 0, out_index = 0;
1034              (in_index < in_len) && (out_index < out_len);
1035              in_index++, out_index++) {
1036                 gint nb_bytes_2_decode = 0;
1037
1038                 if (a_in[in_index] <= 0x7F) {
1039                         /*
1040                          *7 bits long char
1041                          *encoded over 1 byte:
1042                          * 0xxx xxxx
1043                          */
1044                         c = a_in[in_index];
1045                         nb_bytes_2_decode = 1;
1046
1047                 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
1048                         /*
1049                          *up to 11 bits long char.
1050                          *encoded over 2 bytes:
1051                          *110x xxxx  10xx xxxx
1052                          */
1053                         c = a_in[in_index] & 0x1F;
1054                         nb_bytes_2_decode = 2;
1055
1056                 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
1057                         /*
1058                          *up to 16 bit long char
1059                          *encoded over 3 bytes:
1060                          *1110 xxxx  10xx xxxx  10xx xxxx
1061                          */
1062                         c = a_in[in_index] & 0x0F;
1063                         nb_bytes_2_decode = 3;
1064
1065                 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
1066                         /*
1067                          *up to 21 bits long char
1068                          *encoded over 4 bytes:
1069                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
1070                          */
1071                         c = a_in[in_index] & 0x7;
1072                         nb_bytes_2_decode = 4;
1073
1074                 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
1075                         /*
1076                          *up to 26 bits long char
1077                          *encoded over 5 bytes.
1078                          *1111 10xx  10xx xxxx  10xx xxxx
1079                          *10xx xxxx  10xx xxxx
1080                          */
1081                         c = a_in[in_index] & 3;
1082                         nb_bytes_2_decode = 5;
1083
1084                 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
1085                         /*
1086                          *up to 31 bits long char
1087                          *encoded over 6 bytes:
1088                          *1111 110x  10xx xxxx  10xx xxxx
1089                          *10xx xxxx  10xx xxxx  10xx xxxx
1090                          */
1091                         c = a_in[in_index] & 1;
1092                         nb_bytes_2_decode = 6;
1093
1094                 } else {
1095                         /*BAD ENCODING */
1096                         status = CR_ENCODING_ERROR;
1097                         goto end;
1098                 }
1099
1100                 /*
1101                  *Go and decode the remaining byte(s)
1102                  *(if any) to get the current character.
1103                  */
1104                 if (in_index + nb_bytes_2_decode - 1 >= in_len) {
1105                         status = CR_OK;
1106                         goto end;
1107                 }
1108
1109                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
1110                         /*decode the next byte */
1111                         in_index++;
1112
1113                         /*byte pattern must be: 10xx xxxx */
1114                         if ((a_in[in_index] & 0xC0) != 0x80) {
1115                                 status = CR_ENCODING_ERROR;
1116                                 goto end;
1117                         }
1118
1119                         c = (c << 6) | (a_in[in_index] & 0x3F);
1120                 }
1121
1122                 /*
1123                  *The decoded ucs4 char is now
1124                  *in c.
1125                  */
1126
1127                 if (c > 0xFF) {
1128                         status = CR_ENCODING_ERROR;
1129                         goto end;
1130                 }
1131
1132                 a_out[out_index] = c;
1133         }
1134
1135       end:
1136         *a_out_len = out_index;
1137         *a_in_len = in_index;
1138
1139         return CR_OK;
1140 }
1141
1142 /**
1143  *Converts an utf8 buffer into an
1144  *ucs1 buffer.
1145  *@param a_in_start the start of the input buffer.
1146  *@param a_in_end the end of the input buffer.
1147  *@param a_out out parameter. The resulting converted ucs4 buffer.
1148  *Must be freed by the caller.
1149  *@param a_out_len out parameter. The length of the converted buffer.
1150  *@return CR_OK upon successfull completion, an error code otherwise.
1151  *Note that out parameters are valid if and only if this function
1152  *returns CR_OK.
1153  */
1154 enum CRStatus
1155 cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
1156                            gulong * a_in_len,
1157                            guchar ** a_out, gulong * a_out_len)
1158 {
1159         enum CRStatus status = CR_OK;
1160
1161         g_return_val_if_fail (a_in && a_in_len
1162                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1163
1164         if (*a_in_len < 1) {
1165                 *a_out_len = 0;
1166                 *a_out = NULL;
1167                 return CR_OK;
1168         }
1169
1170         status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
1171                                                 a_out_len);
1172
1173         g_return_val_if_fail (status == CR_OK, status);
1174
1175         *a_out = (guchar *) g_malloc0 (*a_out_len * sizeof (guint32));
1176
1177         status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
1178         return status;
1179 }
1180
1181 /*****************************************
1182  *CSS basic types identification utilities
1183  *****************************************/
1184
1185 /**
1186  *Returns TRUE if a_char is a white space as
1187  *defined in the css spec in chap 4.1.1.
1188  *
1189  *white-space ::= ' '| \t|\r|\n|\f
1190  *
1191  *@param a_char the character to test.
1192  *return TRUE if is a white space, false otherwise.
1193  */
1194 gboolean
1195 cr_utils_is_white_space (guint32 a_char)
1196 {
1197         switch (a_char) {
1198         case ' ':
1199         case '\t':
1200         case '\r':
1201         case '\n':
1202         case '\f':
1203                 return TRUE;
1204                 break;
1205         default:
1206                 return FALSE;
1207         }
1208 }
1209
1210 /**
1211  *Returns true if the character is a newline
1212  *as defined in the css spec in the chap 4.1.1.
1213  *
1214  *nl ::= \n|\r\n|\r|\f
1215  *
1216  *@param a_char the character to test.
1217  *@return TRUE if the character is a newline, FALSE otherwise.
1218  */
1219 gboolean
1220 cr_utils_is_newline (guint32 a_char)
1221 {
1222         switch (a_char) {
1223         case '\n':
1224         case '\r':
1225         case '\f':
1226                 return TRUE;
1227                 break;
1228         default:
1229                 return FALSE;
1230         }
1231 }
1232
1233 /**
1234  *returns TRUE if the char is part of an hexa num char:
1235  *i.e hexa_char ::= [0-9A-F]
1236  */
1237 gboolean
1238 cr_utils_is_hexa_char (guint32 a_char)
1239 {
1240         if ((a_char >= '0' && a_char <= '9')
1241             || (a_char >= 'A' && a_char <= 'F')) {
1242                 return TRUE;
1243         }
1244         return FALSE;
1245 }
1246
1247 /**
1248  *Returns true if the character is a nonascii
1249  *character (as defined in the css spec chap 4.1.1):
1250  *
1251  *nonascii ::= [^\0-\177]
1252  *
1253  *@param a_char the character to test.
1254  *@return TRUE if the character is a nonascii char,
1255  *FALSE otherwise.
1256  */
1257 gboolean
1258 cr_utils_is_nonascii (guint32 a_char)
1259 {
1260         if (a_char <= 177) {
1261                 return FALSE;
1262         }
1263
1264         return TRUE;
1265 }
1266
1267 /**
1268  *Dumps a character a_nb times on a file.
1269  *@param a_char the char to dump
1270  *@param a_fp the destination file pointer
1271  *@param a_nb the number of times a_char is to be dumped.
1272  */
1273 void
1274 cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
1275 {
1276         glong i = 0;
1277
1278         for (i = 0; i < a_nb; i++) {
1279                 fprintf (a_fp, "%c", a_char);
1280         }
1281 }
1282
1283 void
1284 cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
1285 {
1286         glong i = 0;
1287
1288         g_return_if_fail (a_string);
1289
1290         for (i = 0; i < a_nb; i++) {
1291                 g_string_append_printf (a_string, "%c", a_char);
1292         }
1293 }
1294
1295 /**
1296  *Duplicates a list of GString instances.
1297  *@return the duplicated list of GString instances or NULL if
1298  *something bad happened.
1299  *@param a_list_of_strings the list of strings to be duplicated.
1300  */
1301 GList *
1302 cr_utils_dup_glist_of_string (GList * a_list_of_strings)
1303 {
1304         GList *cur = NULL,
1305                 *result = NULL;
1306
1307         g_return_val_if_fail (a_list_of_strings, NULL);
1308
1309         for (cur = a_list_of_strings; cur; cur = cur->next) {
1310                 GString *str = NULL;
1311
1312                 str = g_string_new_len (((GString *) cur->data)->str,
1313                                         ((GString *) cur->data)->len);
1314                 if (str)
1315                         result = g_list_append (result, str);
1316         }
1317
1318         return result;
1319 }
1320
1321 /**
1322  *Duplicate a GList where the GList::data is a CRString.
1323  *@param a_list_of_strings the list to duplicate
1324  *@return the duplicated list, or NULL if something bad
1325  *happened.
1326  */
1327 GList *
1328 cr_utils_dup_glist_of_cr_string (GList * a_list_of_strings)
1329 {
1330         GList *cur = NULL, *result = NULL;
1331
1332         g_return_val_if_fail (a_list_of_strings, NULL);
1333
1334         for (cur = a_list_of_strings; cur; cur = cur->next) {
1335                 CRString *str = NULL;
1336
1337                 str = cr_string_dup ((CRString *) cur->data) ;
1338                 if (str)
1339                         result = g_list_append (result, str);
1340         }
1341
1342         return result;
1343 }