1 /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
3 /*
4 * This file is part of The Croco Library
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2.1 of the GNU Lesser General Public
8 * License as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 * USA
19 *
20 * Author: Dodji Seketeli
21 * See COPYRIGHTS file for copyright information.
22 */
24 #include "cr-utils.h"
25 #include "cr-string.h"
/**
 *@file:
 *Some misc utility functions used
 *in the libcroco.
 *Note that throughout this file I will
 *refer to the CSS SPECIFICATIONS DOCUMENTATION
 *written by the w3c guys. You can find that document
 *at http://www.w3.org/TR/REC-CSS2/ .
 */
37 /****************************
38 *Encoding transformations and
39 *encoding helpers
40 ****************************/
/*
 *Here is the correspondence between the ucs-4 character codes
 *and their matching utf-8 encoding pattern as described by RFC 2279:
 *
 *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 *------------------    -----------------------------
 *0000 0000-0000 007F   0xxxxxxx
 *0000 0080-0000 07FF   110xxxxx 10xxxxxx
 *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
 */
56 /**
57 *Given an utf8 string buffer, calculates
58 *the length of this string if it was encoded
59 *in ucs4.
60 *@param a_in_start a pointer to the begining of
61 *the input utf8 string.
62 *@param a_in_end a pointre to the end of the input
63 *utf8 string (points to the last byte of the buffer)
64 *@param a_len out parameter the calculated length.
65 *@return CR_OK upon succesfull completion, an error code
66 *otherwise.
67 */
68 enum CRStatus
69 cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
70 const guchar * a_in_end, gulong * a_len)
71 {
72 guchar *byte_ptr = NULL;
73 gint len = 0;
75 /*
76 *to store the final decoded
77 *unicode char
78 */
79 guint c = 0;
81 g_return_val_if_fail (a_in_start && a_in_end && a_len,
82 CR_BAD_PARAM_ERROR);
83 *a_len = 0;
85 for (byte_ptr = (guchar *) a_in_start;
86 byte_ptr <= a_in_end; byte_ptr++) {
87 gint nb_bytes_2_decode = 0;
89 if (*byte_ptr <= 0x7F) {
90 /*
91 *7 bits long char
92 *encoded over 1 byte:
93 * 0xxx xxxx
94 */
95 c = *byte_ptr;
96 nb_bytes_2_decode = 1;
98 } else if ((*byte_ptr & 0xE0) == 0xC0) {
99 /*
100 *up to 11 bits long char.
101 *encoded over 2 bytes:
102 *110x xxxx 10xx xxxx
103 */
104 c = *byte_ptr & 0x1F;
105 nb_bytes_2_decode = 2;
107 } else if ((*byte_ptr & 0xF0) == 0xE0) {
108 /*
109 *up to 16 bit long char
110 *encoded over 3 bytes:
111 *1110 xxxx 10xx xxxx 10xx xxxx
112 */
113 c = *byte_ptr & 0x0F;
114 nb_bytes_2_decode = 3;
116 } else if ((*byte_ptr & 0xF8) == 0xF0) {
117 /*
118 *up to 21 bits long char
119 *encoded over 4 bytes:
120 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
121 */
122 c = *byte_ptr & 0x7;
123 nb_bytes_2_decode = 4;
125 } else if ((*byte_ptr & 0xFC) == 0xF8) {
126 /*
127 *up to 26 bits long char
128 *encoded over 5 bytes.
129 *1111 10xx 10xx xxxx 10xx xxxx
130 *10xx xxxx 10xx xxxx
131 */
132 c = *byte_ptr & 3;
133 nb_bytes_2_decode = 5;
135 } else if ((*byte_ptr & 0xFE) == 0xFC) {
136 /*
137 *up to 31 bits long char
138 *encoded over 6 bytes:
139 *1111 110x 10xx xxxx 10xx xxxx
140 *10xx xxxx 10xx xxxx 10xx xxxx
141 */
142 c = *byte_ptr & 1;
143 nb_bytes_2_decode = 6;
145 } else {
146 /*
147 *BAD ENCODING
148 */
149 return CR_ENCODING_ERROR;
150 }
152 /*
153 *Go and decode the remaining byte(s)
154 *(if any) to get the current character.
155 */
156 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
157 /*decode the next byte */
158 byte_ptr++;
160 /*byte pattern must be: 10xx xxxx */
161 if ((*byte_ptr & 0xC0) != 0x80) {
162 return CR_ENCODING_ERROR;
163 }
165 c = (c << 6) | (*byte_ptr & 0x3F);
166 }
168 len++;
169 }
171 *a_len = len;
173 return CR_OK;
174 }
176 /**
177 *Given an ucs4 string, this function
178 *returns the size (in bytes) this string
179 *would have occupied if it was encoded in utf-8.
180 *@param a_in_start a pointer to the beginning of the input
181 *buffer.
182 *@param a_in_end a pointer to the end of the input buffer.
183 *@param a_len out parameter. The computed length.
184 *@return CR_OK upon successfull completion, an error code otherwise.
185 */
186 enum CRStatus
187 cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
188 const guint32 * a_in_end, gulong * a_len)
189 {
190 gint len = 0;
191 guint32 *char_ptr = NULL;
193 g_return_val_if_fail (a_in_start && a_in_end && a_len,
194 CR_BAD_PARAM_ERROR);
196 for (char_ptr = (guint32 *) a_in_start;
197 char_ptr <= a_in_end; char_ptr++) {
198 if (*char_ptr <= 0x7F) {
199 /*the utf-8 char would take 1 byte */
200 len += 1;
201 } else if (*char_ptr <= 0x7FF) {
202 /*the utf-8 char would take 2 bytes */
203 len += 2;
204 } else if (*char_ptr <= 0xFFFF) {
205 len += 3;
206 } else if (*char_ptr <= 0x1FFFFF) {
207 len += 4;
208 } else if (*char_ptr <= 0x3FFFFFF) {
209 len += 5;
210 } else if (*char_ptr <= 0x7FFFFFFF) {
211 len += 6;
212 }
213 }
215 *a_len = len;
216 return CR_OK;
217 }
219 /**
220 *Given an ucsA string, this function
221 *returns the size (in bytes) this string
222 *would have occupied if it was encoded in utf-8.
223 *@param a_in_start a pointer to the beginning of the input
224 *buffer.
225 *@param a_in_end a pointer to the end of the input buffer.
226 *@param a_len out parameter. The computed length.
227 *@return CR_OK upon successfull completion, an error code otherwise.
228 */
229 enum CRStatus
230 cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
231 const guchar * a_in_end, gulong * a_len)
232 {
233 gint len = 0;
234 guchar *char_ptr = NULL;
236 g_return_val_if_fail (a_in_start && a_in_end && a_len,
237 CR_BAD_PARAM_ERROR);
239 for (char_ptr = (guchar *) a_in_start;
240 char_ptr <= a_in_end; char_ptr++) {
241 if (*char_ptr <= 0x7F) {
242 /*the utf-8 char would take 1 byte */
243 len += 1;
244 } else {
245 /*the utf-8 char would take 2 bytes */
246 len += 2;
247 }
248 }
250 *a_len = len;
251 return CR_OK;
252 }
254 /**
255 *Converts an utf8 buffer into an ucs4 buffer.
256 *
257 *@param a_in the input utf8 buffer to convert.
258 *@param a_in_len in/out parameter. The size of the
259 *input buffer to convert. After return, this parameter contains
260 *the actual number of bytes consumed.
261 *@param a_out the output converted ucs4 buffer. Must be allocated by
262 *the caller.
263 *@param a_out_len in/out parameter. The size of the output buffer.
264 *If this size is actually smaller than the real needed size, the function
265 *just converts what it can and returns a success status. After return,
266 *this param points to the actual number of characters decoded.
267 *@return CR_OK upon successfull completion, an error code otherwise.
268 */
269 enum CRStatus
270 cr_utils_utf8_to_ucs4 (const guchar * a_in,
271 gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
272 {
273 gulong in_len = 0,
274 out_len = 0,
275 in_index = 0,
276 out_index = 0;
277 enum CRStatus status = CR_OK;
279 /*
280 *to store the final decoded
281 *unicode char
282 */
283 guint c = 0;
285 g_return_val_if_fail (a_in && a_in_len
286 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
288 if (*a_in_len < 1) {
289 status = CR_OK;
290 goto end;
291 }
293 in_len = *a_in_len;
294 out_len = *a_out_len;
296 for (in_index = 0, out_index = 0;
297 (in_index < in_len) && (out_index < out_len);
298 in_index++, out_index++) {
299 gint nb_bytes_2_decode = 0;
301 if (a_in[in_index] <= 0x7F) {
302 /*
303 *7 bits long char
304 *encoded over 1 byte:
305 * 0xxx xxxx
306 */
307 c = a_in[in_index];
308 nb_bytes_2_decode = 1;
310 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
311 /*
312 *up to 11 bits long char.
313 *encoded over 2 bytes:
314 *110x xxxx 10xx xxxx
315 */
316 c = a_in[in_index] & 0x1F;
317 nb_bytes_2_decode = 2;
319 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
320 /*
321 *up to 16 bit long char
322 *encoded over 3 bytes:
323 *1110 xxxx 10xx xxxx 10xx xxxx
324 */
325 c = a_in[in_index] & 0x0F;
326 nb_bytes_2_decode = 3;
328 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
329 /*
330 *up to 21 bits long char
331 *encoded over 4 bytes:
332 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
333 */
334 c = a_in[in_index] & 0x7;
335 nb_bytes_2_decode = 4;
337 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
338 /*
339 *up to 26 bits long char
340 *encoded over 5 bytes.
341 *1111 10xx 10xx xxxx 10xx xxxx
342 *10xx xxxx 10xx xxxx
343 */
344 c = a_in[in_index] & 3;
345 nb_bytes_2_decode = 5;
347 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
348 /*
349 *up to 31 bits long char
350 *encoded over 6 bytes:
351 *1111 110x 10xx xxxx 10xx xxxx
352 *10xx xxxx 10xx xxxx 10xx xxxx
353 */
354 c = a_in[in_index] & 1;
355 nb_bytes_2_decode = 6;
357 } else {
358 /*BAD ENCODING */
359 goto end;
360 }
362 /*
363 *Go and decode the remaining byte(s)
364 *(if any) to get the current character.
365 */
366 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
367 /*decode the next byte */
368 in_index++;
370 /*byte pattern must be: 10xx xxxx */
371 if ((a_in[in_index] & 0xC0) != 0x80) {
372 goto end;
373 }
375 c = (c << 6) | (a_in[in_index] & 0x3F);
376 }
378 /*
379 *The decoded ucs4 char is now
380 *in c.
381 */
383 /************************
384 *Some security tests
385 ***********************/
387 /*be sure c is a char */
388 if (c == 0xFFFF || c == 0xFFFE)
389 goto end;
391 /*be sure c is inferior to the max ucs4 char value */
392 if (c > 0x10FFFF)
393 goto end;
395 /*
396 *c must be less than UTF16 "lower surrogate begin"
397 *or higher than UTF16 "High surrogate end"
398 */
399 if (c >= 0xD800 && c <= 0xDFFF)
400 goto end;
402 /*Avoid characters that equals zero */
403 if (c == 0)
404 goto end;
406 a_out[out_index] = c;
407 }
409 end:
410 *a_out_len = out_index + 1;
411 *a_in_len = in_index + 1;
413 return status;
414 }
416 /**
417 *Reads a character from an utf8 buffer.
418 *Actually decode the next character code (unicode character code)
419 *and returns it.
420 *@param a_in the starting address of the utf8 buffer.
421 *@param a_in_len the length of the utf8 buffer.
422 *@param a_out output parameter. The resulting read char.
423 *@param a_consumed the number of the bytes consumed to
424 *decode the returned character code.
425 *@return CR_OK upon successfull completion, an error code otherwise.
426 */
427 enum CRStatus
428 cr_utils_read_char_from_utf8_buf (const guchar * a_in,
429 gulong a_in_len,
430 guint32 * a_out, gulong * a_consumed)
431 {
432 gulong in_len = 0,
433 in_index = 0,
434 nb_bytes_2_decode = 0;
435 enum CRStatus status = CR_OK;
437 /*
438 *to store the final decoded
439 *unicode char
440 */
441 guint32 c = 0;
443 g_return_val_if_fail (a_in && a_out && a_out
444 && a_consumed, CR_BAD_PARAM_ERROR);
446 if (a_in_len < 1) {
447 status = CR_OK;
448 goto end;
449 }
451 in_len = a_in_len;
453 if (*a_in <= 0x7F) {
454 /*
455 *7 bits long char
456 *encoded over 1 byte:
457 * 0xxx xxxx
458 */
459 c = *a_in;
460 nb_bytes_2_decode = 1;
462 } else if ((*a_in & 0xE0) == 0xC0) {
463 /*
464 *up to 11 bits long char.
465 *encoded over 2 bytes:
466 *110x xxxx 10xx xxxx
467 */
468 c = *a_in & 0x1F;
469 nb_bytes_2_decode = 2;
471 } else if ((*a_in & 0xF0) == 0xE0) {
472 /*
473 *up to 16 bit long char
474 *encoded over 3 bytes:
475 *1110 xxxx 10xx xxxx 10xx xxxx
476 */
477 c = *a_in & 0x0F;
478 nb_bytes_2_decode = 3;
480 } else if ((*a_in & 0xF8) == 0xF0) {
481 /*
482 *up to 21 bits long char
483 *encoded over 4 bytes:
484 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
485 */
486 c = *a_in & 0x7;
487 nb_bytes_2_decode = 4;
489 } else if ((*a_in & 0xFC) == 0xF8) {
490 /*
491 *up to 26 bits long char
492 *encoded over 5 bytes.
493 *1111 10xx 10xx xxxx 10xx xxxx
494 *10xx xxxx 10xx xxxx
495 */
496 c = *a_in & 3;
497 nb_bytes_2_decode = 5;
499 } else if ((*a_in & 0xFE) == 0xFC) {
500 /*
501 *up to 31 bits long char
502 *encoded over 6 bytes:
503 *1111 110x 10xx xxxx 10xx xxxx
504 *10xx xxxx 10xx xxxx 10xx xxxx
505 */
506 c = *a_in & 1;
507 nb_bytes_2_decode = 6;
509 } else {
510 /*BAD ENCODING */
511 goto end;
512 }
514 if (nb_bytes_2_decode > a_in_len) {
515 status = CR_END_OF_INPUT_ERROR;
516 goto end;
517 }
519 /*
520 *Go and decode the remaining byte(s)
521 *(if any) to get the current character.
522 */
523 for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
524 /*byte pattern must be: 10xx xxxx */
525 if ((a_in[in_index] & 0xC0) != 0x80) {
526 goto end;
527 }
529 c = (c << 6) | (a_in[in_index] & 0x3F);
530 }
532 /*
533 *The decoded ucs4 char is now
534 *in c.
535 */
537 /************************
538 *Some security tests
539 ***********************/
541 /*be sure c is a char */
542 if (c == 0xFFFF || c == 0xFFFE)
543 goto end;
545 /*be sure c is inferior to the max ucs4 char value */
546 if (c > 0x10FFFF)
547 goto end;
549 /*
550 *c must be less than UTF16 "lower surrogate begin"
551 *or higher than UTF16 "High surrogate end"
552 */
553 if (c >= 0xD800 && c <= 0xDFFF)
554 goto end;
556 /*Avoid characters that equals zero */
557 if (c == 0)
558 goto end;
560 *a_out = c;
562 end:
563 *a_consumed = nb_bytes_2_decode;
565 return status;
566 }
568 /**
569 *
570 */
571 enum CRStatus
572 cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
573 const guchar * a_in_end, gulong * a_len)
574 {
575 /*
576 *Note: this function can be made shorter
577 *but it considers all the cases of the utf8 encoding
578 *to ease further extensions ...
579 */
581 guchar *byte_ptr = NULL;
582 gint len = 0;
584 /*
585 *to store the final decoded
586 *unicode char
587 */
588 guint c = 0;
590 g_return_val_if_fail (a_in_start && a_in_end && a_len,
591 CR_BAD_PARAM_ERROR);
592 *a_len = 0;
594 for (byte_ptr = (guchar *) a_in_start;
595 byte_ptr <= a_in_end; byte_ptr++) {
596 gint nb_bytes_2_decode = 0;
598 if (*byte_ptr <= 0x7F) {
599 /*
600 *7 bits long char
601 *encoded over 1 byte:
602 * 0xxx xxxx
603 */
604 c = *byte_ptr;
605 nb_bytes_2_decode = 1;
607 } else if ((*byte_ptr & 0xE0) == 0xC0) {
608 /*
609 *up to 11 bits long char.
610 *encoded over 2 bytes:
611 *110x xxxx 10xx xxxx
612 */
613 c = *byte_ptr & 0x1F;
614 nb_bytes_2_decode = 2;
616 } else if ((*byte_ptr & 0xF0) == 0xE0) {
617 /*
618 *up to 16 bit long char
619 *encoded over 3 bytes:
620 *1110 xxxx 10xx xxxx 10xx xxxx
621 */
622 c = *byte_ptr & 0x0F;
623 nb_bytes_2_decode = 3;
625 } else if ((*byte_ptr & 0xF8) == 0xF0) {
626 /*
627 *up to 21 bits long char
628 *encoded over 4 bytes:
629 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
630 */
631 c = *byte_ptr & 0x7;
632 nb_bytes_2_decode = 4;
634 } else if ((*byte_ptr & 0xFC) == 0xF8) {
635 /*
636 *up to 26 bits long char
637 *encoded over 5 bytes.
638 *1111 10xx 10xx xxxx 10xx xxxx
639 *10xx xxxx 10xx xxxx
640 */
641 c = *byte_ptr & 3;
642 nb_bytes_2_decode = 5;
644 } else if ((*byte_ptr & 0xFE) == 0xFC) {
645 /*
646 *up to 31 bits long char
647 *encoded over 6 bytes:
648 *1111 110x 10xx xxxx 10xx xxxx
649 *10xx xxxx 10xx xxxx 10xx xxxx
650 */
651 c = *byte_ptr & 1;
652 nb_bytes_2_decode = 6;
654 } else {
655 /*
656 *BAD ENCODING
657 */
658 return CR_ENCODING_ERROR;
659 }
661 /*
662 *Go and decode the remaining byte(s)
663 *(if any) to get the current character.
664 */
665 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
666 /*decode the next byte */
667 byte_ptr++;
669 /*byte pattern must be: 10xx xxxx */
670 if ((*byte_ptr & 0xC0) != 0x80) {
671 return CR_ENCODING_ERROR;
672 }
674 c = (c << 6) | (*byte_ptr & 0x3F);
675 }
677 /*
678 *The decoded ucs4 char is now
679 *in c.
680 */
682 if (c <= 0xFF) { /*Add other conditions to support
683 *other char sets (ucs2, ucs3, ucs4).
684 */
685 len++;
686 } else {
687 /*the char is too long to fit
688 *into the supposed charset len.
689 */
690 return CR_ENCODING_ERROR;
691 }
692 }
694 *a_len = len;
696 return CR_OK;
697 }
699 /**
700 *Converts an utf8 string into an ucs4 string.
701 *@param a_in the input string to convert.
702 *@param a_in_len in/out parameter. The length of the input
703 *string. After return, points to the actual number of bytes
704 *consumed. This can be usefull to debug the input stream in case
705 *of encoding error.
706 *@param a_out out parameter. Points to the output string. It is allocated
707 *by this function and must be freed by the caller.
708 *@param a_out_len out parameter. The length of the output string.
709 *@return CR_OK upon successfull completion, an error code otherwise.
710 *
711 */
712 enum CRStatus
713 cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
714 gulong * a_in_len,
715 guint32 ** a_out, gulong * a_out_len)
716 {
717 enum CRStatus status = CR_OK;
719 g_return_val_if_fail (a_in && a_in_len
720 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
722 status = cr_utils_utf8_str_len_as_ucs4 (a_in,
723 &a_in[*a_in_len - 1],
724 a_out_len);
726 g_return_val_if_fail (status == CR_OK, status);
728 *a_out = (guint32 *) g_malloc0 (*a_out_len * sizeof (guint32));
730 status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
732 return status;
733 }
735 /**
736 *Converts an ucs4 buffer into an utf8 buffer.
737 *
738 *@param a_in the input ucs4 buffer to convert.
739 *@param a_in_len in/out parameter. The size of the
740 *input buffer to convert. After return, this parameter contains
741 *the actual number of characters consumed.
742 *@param a_out the output converted utf8 buffer. Must be allocated by
743 *the caller.
744 *@param a_out_len in/out parameter. The size of the output buffer.
745 *If this size is actually smaller than the real needed size, the function
746 *just converts what it can and returns a success status. After return,
747 *this param points to the actual number of bytes in the buffer.
748 *@return CR_OK upon successfull completion, an error code otherwise.
749 */
750 enum CRStatus
751 cr_utils_ucs4_to_utf8 (const guint32 * a_in,
752 gulong * a_in_len, guchar * a_out, gulong * a_out_len)
753 {
754 gulong in_len = 0,
755 in_index = 0,
756 out_index = 0;
757 enum CRStatus status = CR_OK;
759 g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
760 CR_BAD_PARAM_ERROR);
762 if (*a_in_len < 1) {
763 status = CR_OK;
764 goto end;
765 }
767 in_len = *a_in_len;
769 for (in_index = 0; in_index < in_len; in_index++) {
770 /*
771 *FIXME: return whenever we encounter forbidden char values.
772 */
774 if (a_in[in_index] <= 0x7F) {
775 a_out[out_index] = a_in[in_index];
776 out_index++;
777 } else if (a_in[in_index] <= 0x7FF) {
778 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
779 a_out[out_index + 1] =
780 (0x80 | (a_in[in_index] & 0x3F));
781 out_index += 2;
782 } else if (a_in[in_index] <= 0xFFFF) {
783 a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
784 a_out[out_index + 1] =
785 (0x80 | ((a_in[in_index] >> 6) & 0x3F));
786 a_out[out_index + 2] =
787 (0x80 | (a_in[in_index] & 0x3F));
788 out_index += 3;
789 } else if (a_in[in_index] <= 0x1FFFFF) {
790 a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
791 a_out[out_index + 1]
792 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
793 a_out[out_index + 2]
794 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
795 a_out[out_index + 3]
796 = (0x80 | (a_in[in_index] & 0x3F));
797 out_index += 4;
798 } else if (a_in[in_index] <= 0x3FFFFFF) {
799 a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
800 a_out[out_index + 1] =
801 (0x80 | (a_in[in_index] >> 18));
802 a_out[out_index + 2]
803 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
804 a_out[out_index + 3]
805 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
806 a_out[out_index + 4]
807 = (0x80 | (a_in[in_index] & 0x3F));
808 out_index += 5;
809 } else if (a_in[in_index] <= 0x7FFFFFFF) {
810 a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
811 a_out[out_index + 1] =
812 (0x80 | (a_in[in_index] >> 24));
813 a_out[out_index + 2]
814 = (0x80 | ((a_in[in_index] >> 18) & 0x3F));
815 a_out[out_index + 3]
816 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
817 a_out[out_index + 4]
818 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
819 a_out[out_index + 4]
820 = (0x80 | (a_in[in_index] & 0x3F));
821 out_index += 6;
822 } else {
823 status = CR_ENCODING_ERROR;
824 goto end;
825 }
826 } /*end for */
828 end:
829 *a_in_len = in_index + 1;
830 *a_out_len = out_index + 1;
832 return status;
833 }
835 /**
836 *Converts an ucs4 string into an utf8 string.
837 *@param a_in the input string to convert.
838 *@param a_in_len in/out parameter. The length of the input
839 *string. After return, points to the actual number of characters
840 *consumed. This can be usefull to debug the input string in case
841 *of encoding error.
842 *@param a_out out parameter. Points to the output string. It is allocated
843 *by this function and must be freed by the caller.
844 *@param a_out_len out parameter. The length (in bytes) of the output string.
845 *@return CR_OK upon successfull completion, an error code otherwise.
846 */
847 enum CRStatus
848 cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
849 gulong * a_in_len,
850 guchar ** a_out, gulong * a_out_len)
851 {
852 enum CRStatus status = CR_OK;
854 g_return_val_if_fail (a_in && a_in_len && a_out
855 && a_out_len, CR_BAD_PARAM_ERROR);
857 status = cr_utils_ucs4_str_len_as_utf8 (a_in,
858 &a_in[*a_out_len - 1],
859 a_out_len);
861 g_return_val_if_fail (status == CR_OK, status);
863 status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
865 return status;
866 }
868 /**
869 *Converts an ucs1 buffer into an utf8 buffer.
870 *The caller must know the size of the resulting buffer and
871 *allocate it prior to calling this function.
872 *
873 *@param a_in the input ucs1 buffer.
874 *
875 *@param a_in_len in/out parameter. The length of the input buffer.
876 *After return, points to the number of bytes actually consumed even
877 *in case of encoding error.
878 *
879 *@param a_out out parameter. The output utf8 converted buffer.
880 *
881 *@param a_out_len in/out parameter. The size of the output buffer.
882 *If the output buffer size is shorter than the actual needed size,
883 *this function just convert what it can.
884 *
885 *@return CR_OK upon successfull completion, an error code otherwise.
886 *
887 */
888 enum CRStatus
889 cr_utils_ucs1_to_utf8 (const guchar * a_in,
890 gulong * a_in_len, guchar * a_out, gulong * a_out_len)
891 {
892 gulong out_index = 0,
893 in_index = 0,
894 in_len = 0,
895 out_len = 0;
896 enum CRStatus status = CR_OK;
898 g_return_val_if_fail (a_in && a_in_len
899 && a_out_len,
900 CR_BAD_PARAM_ERROR);
902 if (*a_in_len == 0) {
903 *a_out_len = 0 ;
904 return CR_OK ;
905 }
906 g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;
908 if (*a_in_len < 1) {
909 status = CR_OK;
910 goto end;
911 }
913 in_len = *a_in_len;
914 out_len = *a_out_len;
916 for (in_index = 0, out_index = 0;
917 (in_index < in_len) && (out_index < out_len); in_index++) {
918 /*
919 *FIXME: return whenever we encounter forbidden char values.
920 */
922 if (a_in[in_index] <= 0x7F) {
923 a_out[out_index] = a_in[in_index];
924 out_index++;
925 } else {
926 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
927 a_out[out_index + 1] =
928 (0x80 | (a_in[in_index] & 0x3F));
929 out_index += 2;
930 }
931 } /*end for */
933 end:
934 *a_in_len = in_index;
935 *a_out_len = out_index;
937 return CR_OK;
938 }
940 /**
941 *Converts an ucs1 string into an utf8 string.
942 *@param a_in_start the beginning of the input string to convert.
943 *@param a_in_end the end of the input string to convert.
944 *@param a_out out parameter. The converted string.
945 *@param a_out out parameter. The length of the converted string.
946 *@return CR_OK upon successfull completion, an error code otherwise.
947 *
948 */
949 enum CRStatus
950 cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
951 gulong * a_in_len,
952 guchar ** a_out, gulong * a_out_len)
953 {
954 gulong in_len = 0,
955 out_len = 0;
956 enum CRStatus status = CR_OK;
958 g_return_val_if_fail (a_in && a_in_len && a_out
959 && a_out_len, CR_BAD_PARAM_ERROR);
961 if (*a_in_len < 1) {
962 *a_out_len = 0;
963 *a_out = NULL;
964 return CR_OK;
965 }
967 status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
968 &out_len);
970 g_return_val_if_fail (status == CR_OK, status);
972 in_len = *a_in_len;
974 *a_out = (guchar *) g_malloc0 (out_len);
976 status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);
978 *a_out_len = out_len;
980 return status;
981 }
983 /**
984 *Converts an utf8 buffer into an ucs1 buffer.
985 *The caller must know the size of the resulting
986 *converted buffer, and allocated it prior to calling this
987 *function.
988 *
989 *@param a_in the input utf8 buffer to convert.
990 *
991 *@param a_in_len in/out parameter. The size of the input utf8 buffer.
992 *After return, points to the number of bytes consumed
993 *by the function even in case of encoding error.
994 *
995 *@param a_out out parameter. Points to the resulting buffer.
996 *Must be allocated by the caller. If the size of a_out is shorter
997 *than its required size, this function converts what it can and return
998 *a successfull status.
999 *
1000 *@param a_out_len in/out parameter. The size of the output buffer.
1001 *After return, points to the number of bytes consumed even in case of
1002 *encoding error.
1003 *
1004 *@return CR_OK upon successfull completion, an error code otherwise.
1005 */
1006 enum CRStatus
1007 cr_utils_utf8_to_ucs1 (const guchar * a_in,
1008 gulong * a_in_len, guchar * a_out, gulong * a_out_len)
1009 {
1010 gulong in_index = 0,
1011 out_index = 0,
1012 in_len = 0,
1013 out_len = 0;
1014 enum CRStatus status = CR_OK;
1016 /*
1017 *to store the final decoded
1018 *unicode char
1019 */
1020 guint32 c = 0;
1022 g_return_val_if_fail (a_in && a_in_len
1023 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1025 if (*a_in_len < 1) {
1026 status = CR_OK;
1027 goto end;
1028 }
1030 in_len = *a_in_len;
1031 out_len = *a_out_len;
1033 for (in_index = 0, out_index = 0;
1034 (in_index < in_len) && (out_index < out_len);
1035 in_index++, out_index++) {
1036 gint nb_bytes_2_decode = 0;
1038 if (a_in[in_index] <= 0x7F) {
1039 /*
1040 *7 bits long char
1041 *encoded over 1 byte:
1042 * 0xxx xxxx
1043 */
1044 c = a_in[in_index];
1045 nb_bytes_2_decode = 1;
1047 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
1048 /*
1049 *up to 11 bits long char.
1050 *encoded over 2 bytes:
1051 *110x xxxx 10xx xxxx
1052 */
1053 c = a_in[in_index] & 0x1F;
1054 nb_bytes_2_decode = 2;
1056 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
1057 /*
1058 *up to 16 bit long char
1059 *encoded over 3 bytes:
1060 *1110 xxxx 10xx xxxx 10xx xxxx
1061 */
1062 c = a_in[in_index] & 0x0F;
1063 nb_bytes_2_decode = 3;
1065 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
1066 /*
1067 *up to 21 bits long char
1068 *encoded over 4 bytes:
1069 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
1070 */
1071 c = a_in[in_index] & 0x7;
1072 nb_bytes_2_decode = 4;
1074 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
1075 /*
1076 *up to 26 bits long char
1077 *encoded over 5 bytes.
1078 *1111 10xx 10xx xxxx 10xx xxxx
1079 *10xx xxxx 10xx xxxx
1080 */
1081 c = a_in[in_index] & 3;
1082 nb_bytes_2_decode = 5;
1084 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
1085 /*
1086 *up to 31 bits long char
1087 *encoded over 6 bytes:
1088 *1111 110x 10xx xxxx 10xx xxxx
1089 *10xx xxxx 10xx xxxx 10xx xxxx
1090 */
1091 c = a_in[in_index] & 1;
1092 nb_bytes_2_decode = 6;
1094 } else {
1095 /*BAD ENCODING */
1096 status = CR_ENCODING_ERROR;
1097 goto end;
1098 }
1100 /*
1101 *Go and decode the remaining byte(s)
1102 *(if any) to get the current character.
1103 */
1104 if (in_index + nb_bytes_2_decode - 1 >= in_len) {
1105 status = CR_OK;
1106 goto end;
1107 }
1109 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
1110 /*decode the next byte */
1111 in_index++;
1113 /*byte pattern must be: 10xx xxxx */
1114 if ((a_in[in_index] & 0xC0) != 0x80) {
1115 status = CR_ENCODING_ERROR;
1116 goto end;
1117 }
1119 c = (c << 6) | (a_in[in_index] & 0x3F);
1120 }
1122 /*
1123 *The decoded ucs4 char is now
1124 *in c.
1125 */
1127 if (c > 0xFF) {
1128 status = CR_ENCODING_ERROR;
1129 goto end;
1130 }
1132 a_out[out_index] = c;
1133 }
1135 end:
1136 *a_out_len = out_index;
1137 *a_in_len = in_index;
1139 return CR_OK;
1140 }
1142 /**
1143 *Converts an utf8 buffer into an
1144 *ucs1 buffer.
1145 *@param a_in_start the start of the input buffer.
1146 *@param a_in_end the end of the input buffer.
1147 *@param a_out out parameter. The resulting converted ucs4 buffer.
1148 *Must be freed by the caller.
1149 *@param a_out_len out parameter. The length of the converted buffer.
1150 *@return CR_OK upon successfull completion, an error code otherwise.
1151 *Note that out parameters are valid if and only if this function
1152 *returns CR_OK.
1153 */
1154 enum CRStatus
1155 cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
1156 gulong * a_in_len,
1157 guchar ** a_out, gulong * a_out_len)
1158 {
1159 enum CRStatus status = CR_OK;
1161 g_return_val_if_fail (a_in && a_in_len
1162 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1164 if (*a_in_len < 1) {
1165 *a_out_len = 0;
1166 *a_out = NULL;
1167 return CR_OK;
1168 }
1170 status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
1171 a_out_len);
1173 g_return_val_if_fail (status == CR_OK, status);
1175 *a_out = (guchar *) g_malloc0 (*a_out_len * sizeof (guint32));
1177 status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
1178 return status;
1179 }
1181 /*****************************************
1182 *CSS basic types identification utilities
1183 *****************************************/
1185 /**
1186 *Returns TRUE if a_char is a white space as
1187 *defined in the css spec in chap 4.1.1.
1188 *
1189 *white-space ::= ' '| \t|\r|\n|\f
1190 *
1191 *@param a_char the character to test.
1192 *return TRUE if is a white space, false otherwise.
1193 */
1194 gboolean
1195 cr_utils_is_white_space (guint32 a_char)
1196 {
1197 switch (a_char) {
1198 case ' ':
1199 case '\t':
1200 case '\r':
1201 case '\n':
1202 case '\f':
1203 return TRUE;
1204 break;
1205 default:
1206 return FALSE;
1207 }
1208 }
1210 /**
1211 *Returns true if the character is a newline
1212 *as defined in the css spec in the chap 4.1.1.
1213 *
1214 *nl ::= \n|\r\n|\r|\f
1215 *
1216 *@param a_char the character to test.
1217 *@return TRUE if the character is a newline, FALSE otherwise.
1218 */
1219 gboolean
1220 cr_utils_is_newline (guint32 a_char)
1221 {
1222 switch (a_char) {
1223 case '\n':
1224 case '\r':
1225 case '\f':
1226 return TRUE;
1227 break;
1228 default:
1229 return FALSE;
1230 }
1231 }
1233 /**
1234 *returns TRUE if the char is part of an hexa num char:
1235 *i.e hexa_char ::= [0-9A-F]
1236 */
1237 gboolean
1238 cr_utils_is_hexa_char (guint32 a_char)
1239 {
1240 if ((a_char >= '0' && a_char <= '9')
1241 || (a_char >= 'A' && a_char <= 'F')) {
1242 return TRUE;
1243 }
1244 return FALSE;
1245 }
1247 /**
1248 *Returns true if the character is a nonascii
1249 *character (as defined in the css spec chap 4.1.1):
1250 *
1251 *nonascii ::= [^\0-\177]
1252 *
1253 *@param a_char the character to test.
1254 *@return TRUE if the character is a nonascii char,
1255 *FALSE otherwise.
1256 */
1257 gboolean
1258 cr_utils_is_nonascii (guint32 a_char)
1259 {
1260 if (a_char <= 177) {
1261 return FALSE;
1262 }
1264 return TRUE;
1265 }
1267 /**
1268 *Dumps a character a_nb times on a file.
1269 *@param a_char the char to dump
1270 *@param a_fp the destination file pointer
1271 *@param a_nb the number of times a_char is to be dumped.
1272 */
1273 void
1274 cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
1275 {
1276 glong i = 0;
1278 for (i = 0; i < a_nb; i++) {
1279 fprintf (a_fp, "%c", a_char);
1280 }
1281 }
1283 void
1284 cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
1285 {
1286 glong i = 0;
1288 g_return_if_fail (a_string);
1290 for (i = 0; i < a_nb; i++) {
1291 g_string_append_printf (a_string, "%c", a_char);
1292 }
1293 }
1295 /**
1296 *Duplicates a list of GString instances.
1297 *@return the duplicated list of GString instances or NULL if
1298 *something bad happened.
1299 *@param a_list_of_strings the list of strings to be duplicated.
1300 */
1301 GList *
1302 cr_utils_dup_glist_of_string (GList * a_list_of_strings)
1303 {
1304 GList *cur = NULL,
1305 *result = NULL;
1307 g_return_val_if_fail (a_list_of_strings, NULL);
1309 for (cur = a_list_of_strings; cur; cur = cur->next) {
1310 GString *str = NULL;
1312 str = g_string_new_len (((GString *) cur->data)->str,
1313 ((GString *) cur->data)->len);
1314 if (str)
1315 result = g_list_append (result, str);
1316 }
1318 return result;
1319 }
1321 /**
1322 *Duplicate a GList where the GList::data is a CRString.
1323 *@param a_list_of_strings the list to duplicate
1324 *@return the duplicated list, or NULL if something bad
1325 *happened.
1326 */
1327 GList *
1328 cr_utils_dup_glist_of_cr_string (GList * a_list_of_strings)
1329 {
1330 GList *cur = NULL, *result = NULL;
1332 g_return_val_if_fail (a_list_of_strings, NULL);
1334 for (cur = a_list_of_strings; cur; cur = cur->next) {
1335 CRString *str = NULL;
1337 str = cr_string_dup ((CRString *) cur->data) ;
1338 if (str)
1339 result = g_list_append (result, str);
1340 }
1342 return result;
1343 }