1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 *
3 * ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * The Original Code is Mozilla Communicator client code, released
17 * March 31, 1998.
18 *
19 * The Initial Developer of the Original Code is
20 * Netscape Communications Corporation.
21 * Portions created by the Initial Developer are Copyright (C) 1998
22 * the Initial Developer. All Rights Reserved.
23 *
24 * Contributor(s):
25 *
26 * Alternatively, the contents of this file may be used under the terms of
27 * either of the GNU General Public License Version 2 or later (the "GPL"),
28 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
37 *
38 * ***** END LICENSE BLOCK ***** */
40 #ifndef jsstr_h___
41 #define jsstr_h___
42 /*
43 * JS string type implementation.
44 *
45 * A JS string is a counted array of unicode characters. To support handoff
46 * of API client memory, the chars are allocated separately from the length,
47 * necessitating a pointer after the count, to form a separately allocated
48 * string descriptor. String descriptors are GC'ed, while their chars are
49 * allocated from the malloc heap.
50 *
51 * When a string is treated as an object (by following it with . or []), the
52 * runtime wraps it with a JSObject whose valueOf method returns the unwrapped
53 * string descriptor.
54 */
55 #include <ctype.h>
56 #include "jspubtd.h"
57 #include "jsprvtd.h"
58 #include "jshash.h"
60 JS_BEGIN_EXTERN_C
62 /*
63 * The original GC-thing "string" type, a flat character string owned by its
64 * GC-thing descriptor. The chars member points to a vector having byte size
65 * (length + 1) * sizeof(jschar), terminated at index length by a zero jschar.
66 * The terminator is purely a backstop, in case the chars pointer flows out to
67 * native code that requires \u0000 termination.
68 *
69 * NB: Always use the JSSTRING_LENGTH and JSSTRING_CHARS accessor macros,
70 * unless you guard str->member uses with !JSSTRING_IS_DEPENDENT(str).
71 */
/*
 * Descriptor for a string: a character count plus a pointer to the
 * separately allocated characters.
 */
struct JSString {
    /*
     * Number of jschars in the string. The high-order JSSTRFLAG_BITS bits of
     * this word carry the JSSTRFLAG_* flags, so read it through
     * JSSTRING_LENGTH unless the string is known not to be dependent.
     */
    size_t length;
    /*
     * Character vector of a flat string: length + 1 jschars, the last being a
     * zero terminator. Use JSSTRING_CHARS unless the string is known not to
     * be dependent.
     */
    jschar *chars;
};
77 /*
78 * Overlay structure for a string that depends on another string's characters.
79 * Distinguished by the JSSTRFLAG_DEPENDENT bit being set in length. The base
80 * member may point to another dependent string if JSSTRING_CHARS has not been
81 * called yet. The length chars in a dependent string are stored starting at
82 * base->chars + start, and are not necessarily zero-terminated. If start is
83 * 0, it is not stored, length is a full size_t (minus the JSSTRFLAG_* bits in
84 * the high two positions), and the JSSTRFLAG_PREFIX flag is set.
85 */
/*
 * Overlay of JSString used when JSSTRFLAG_DEPENDENT is set in length. It has
 * the same two-word shape as JSString, with the chars pointer replaced by a
 * pointer to the base string.
 */
struct JSDependentString {
    /*
     * Packed word: the JSSTRFLAG_* bits at the top, then (unless
     * JSSTRFLAG_PREFIX is set) the start offset, then the length. Decode it
     * with JSSTRDEP_START and JSSTRDEP_LENGTH.
     */
    size_t length;
    /* String whose characters this one shares; may itself be dependent. */
    JSString *base;
};
/*
 * Definitions for flags stored in the high order bits of JSString.length.
 *
 * JSSTRFLAG_BITS is the number of bits reserved for flags. JSSTRFLAG_SHIFT
 * moves a small flag value up into those bits, i.e. just above the
 * JSSTRING_LENGTH_BITS low-order bits that hold the length.
 * JSSTRING_LENGTH_BITS is defined further down; that is fine because macro
 * bodies are only expanded at the point of use. JSSTRFLAG_MASK covers all the
 * flag bits.
 */
#define JSSTRFLAG_BITS 2
#define JSSTRFLAG_SHIFT(flg) ((size_t)(flg) << JSSTRING_LENGTH_BITS)
#define JSSTRFLAG_MASK JSSTRFLAG_SHIFT(JS_BITMASK(JSSTRFLAG_BITS))
/* Set when the descriptor is really a JSDependentString. */
#define JSSTRFLAG_DEPENDENT JSSTRFLAG_SHIFT(1)
/* Set (together with DEPENDENT) when the start offset is 0 and not stored. */
#define JSSTRFLAG_PREFIX JSSTRFLAG_SHIFT(2)
/*
 * Universal JSString type inquiry and accessor macros.
 *
 * These work on both flat and dependent strings. Note that the IS_* and
 * HAS_FLAG macros yield the masked flag bit itself (zero or a large size_t),
 * not 0/1, and that JSSTRING_CHARS and JSSTRING_LENGTH expand str more than
 * once, so the argument must be free of side effects.
 */
/* A size_t with only bit n set, and a mask of the n low-order bits. */
#define JSSTRING_BIT(n) ((size_t)1 << (n))
#define JSSTRING_BITMASK(n) (JSSTRING_BIT(n) - 1)
/* Nonzero iff the given JSSTRFLAG_* bit is set in str->length. */
#define JSSTRING_HAS_FLAG(str,flg) ((str)->length & (flg))
#define JSSTRING_IS_DEPENDENT(str) JSSTRING_HAS_FLAG(str, JSSTRFLAG_DEPENDENT)
#define JSSTRING_IS_PREFIX(str) JSSTRING_HAS_FLAG(str, JSSTRFLAG_PREFIX)
/* Pointer to the first character, for flat and dependent strings alike. */
#define JSSTRING_CHARS(str) (JSSTRING_IS_DEPENDENT(str) \
    ? JSSTRDEP_CHARS(str) \
    : (str)->chars)
/* Character count with the flag/start bits stripped for dependent strings. */
#define JSSTRING_LENGTH(str) (JSSTRING_IS_DEPENDENT(str) \
    ? JSSTRDEP_LENGTH(str) \
    : (str)->length)
/* Number of bits of the length word left over once the flag bits are taken. */
#define JSSTRING_LENGTH_BITS (sizeof(size_t) * JS_BITS_PER_BYTE \
    - JSSTRFLAG_BITS)
#define JSSTRING_LENGTH_MASK JSSTRING_BITMASK(JSSTRING_LENGTH_BITS)
/*
 * Specific JSDependentString shift/mask accessor and mutator macros.
 *
 * For a non-prefix dependent string the JSSTRING_LENGTH_BITS non-flag bits
 * are split in two: the low JSSTRDEP_LENGTH_BITS bits (half of them, rounded
 * down) hold the length, and the remaining JSSTRDEP_START_BITS bits above
 * them hold the start offset. JSSTRDEP_LENGTH_BITS is defined after its first
 * textual use; macro bodies are only expanded at the point of use.
 */
#define JSSTRDEP_START_BITS (JSSTRING_LENGTH_BITS-JSSTRDEP_LENGTH_BITS)
#define JSSTRDEP_START_SHIFT JSSTRDEP_LENGTH_BITS
#define JSSTRDEP_START_MASK JSSTRING_BITMASK(JSSTRDEP_START_BITS)
#define JSSTRDEP_LENGTH_BITS (JSSTRING_LENGTH_BITS / 2)
#define JSSTRDEP_LENGTH_MASK JSSTRING_BITMASK(JSSTRDEP_LENGTH_BITS)
/* View a JSString pointer as a JSDependentString pointer (unchecked cast). */
#define JSSTRDEP(str) ((JSDependentString *)(str))
/* Start offset into the base's characters; 0 for a prefix string. */
#define JSSTRDEP_START(str) (JSSTRING_IS_PREFIX(str) ? 0 \
    : ((JSSTRDEP(str)->length \
    >> JSSTRDEP_START_SHIFT) \
    & JSSTRDEP_START_MASK))
/*
 * Character count: a prefix string uses all the non-flag bits, any other
 * dependent string only the low JSSTRDEP_LENGTH_BITS bits.
 */
#define JSSTRDEP_LENGTH(str) (JSSTRDEP(str)->length \
    & (JSSTRING_IS_PREFIX(str) \
    ? JSSTRING_LENGTH_MASK \
    : JSSTRDEP_LENGTH_MASK))
/*
 * Mutators for the packed length word of a dependent string.
 *
 * JSSTRDEP_SET_START_AND_LENGTH marks str as dependent and stores off in the
 * start field and len in the length field. off is widened to size_t before it
 * is shifted: a shift is performed in the type of its left operand, so an
 * argument narrower than size_t (e.g. a 32-bit integer on a 64-bit target)
 * would otherwise lose its high-order bits. Neither value is masked, so the
 * caller must make sure off fits in JSSTRDEP_START_BITS bits and len in
 * JSSTRDEP_LENGTH_BITS bits.
 *
 * JSPREFIX_SET_LENGTH marks str as a dependent prefix (start is implicitly 0
 * and is not stored) and stores len, which must fit in JSSTRING_LENGTH_BITS
 * bits.
 *
 * Both macros evaluate each argument exactly once and yield the new value of
 * the length word.
 */
#define JSSTRDEP_SET_START_AND_LENGTH(str,off,len)                            \
    (JSSTRDEP(str)->length = JSSTRFLAG_DEPENDENT                              \
                           | ((size_t)(off) << JSSTRDEP_START_SHIFT)          \
                           | (len))
#define JSPREFIX_SET_LENGTH(str,len)                                          \
    (JSSTRDEP(str)->length = JSSTRFLAG_DEPENDENT | JSSTRFLAG_PREFIX | (len))
/* Read and write the base pointer of a dependent string. */
#define JSSTRDEP_BASE(str) (JSSTRDEP(str)->base)
#define JSSTRDEP_SET_BASE(str,bstr) (JSSTRDEP(str)->base = (bstr))
/* Prefix strings use the same base field; these are plain aliases. */
#define JSPREFIX_BASE(str) JSSTRDEP_BASE(str)
#define JSPREFIX_SET_BASE(str,bstr) JSSTRDEP_SET_BASE(str,bstr)
/*
 * Pointer to the first character of dependent string str. When the base is a
 * flat string this is simply base->chars + start; when the base is itself
 * dependent the work is handed to js_GetDependentStringChars. str is expanded
 * several times, so it must be free of side effects.
 */
#define JSSTRDEP_CHARS(str) \
    (JSSTRING_IS_DEPENDENT(JSSTRDEP_BASE(str)) \
    ? js_GetDependentStringChars(str) \
    : JSSTRDEP_BASE(str)->chars + JSSTRDEP_START(str))
/*
 * NOTE(review): undocumented in this header. Judging by the name and
 * signature it presumably follows the chain of bases starting at str and
 * reports the final base through *basep; the meaning of level and of the
 * size_t result must be confirmed against the definition.
 */
extern size_t
js_MinimizeDependentStrings(JSString *str, int level, JSString **basep);
/*
 * Out-of-line helper used by JSSTRDEP_CHARS when the base of dependent string
 * str is itself a dependent string.
 */
extern jschar *
js_GetDependentStringChars(JSString *str);
/*
 * NOTE(review): presumably returns the characters of str whether it is flat
 * or dependent; check the definition for any side effects on str.
 */
extern jschar *
js_GetStringChars(JSString *str);
/*
 * NOTE(review): presumably returns a string holding left followed by right,
 * or null on failure -- confirm.
 */
extern JSString *
js_ConcatStrings(JSContext *cx, JSString *left, JSString *right);
/*
 * NOTE(review): presumably converts dependent string str into a flat string
 * owning its own characters and returns those characters -- confirm.
 */
extern const jschar *
js_UndependString(JSContext *cx, JSString *str);
/*
 * A counted reference to a run of jschars that are not modified through this
 * structure (chars is a pointer to const).
 */
struct JSSubString {
    /* Number of jschars referenced. */
    size_t length;
    /* First referenced jschar. */
    const jschar *chars;
};
/* NOTE(review): presumably an empty (terminator-only) jschar string. */
extern jschar js_empty_ucstr[];
/* NOTE(review): presumably a JSSubString of length 0 -- confirm. */
extern JSSubString js_EmptySubString;
/*
 * Unicode character attribute lookup tables, consulted by JS_CCODE: js_X is
 * indexed by the character's high bits, js_Y by the js_X result combined with
 * the character's low six bits, and js_A by the js_Y result.
 */
extern const uint8 js_X[];
extern const uint8 js_Y[];
extern const uint32 js_A[];
/*
 * Unicode general category codes; JS_CTYPE results are compared against
 * these. The values are fixed explicitly, and no member has the value 17.
 */
enum JSCharType {
    JSCT_UNASSIGNED             = 0,

    /* Letters. */
    JSCT_UPPERCASE_LETTER       = 1,
    JSCT_LOWERCASE_LETTER       = 2,
    JSCT_TITLECASE_LETTER       = 3,
    JSCT_MODIFIER_LETTER        = 4,
    JSCT_OTHER_LETTER           = 5,

    /* Marks. */
    JSCT_NON_SPACING_MARK       = 6,
    JSCT_ENCLOSING_MARK         = 7,
    JSCT_COMBINING_SPACING_MARK = 8,

    /* Numbers. */
    JSCT_DECIMAL_DIGIT_NUMBER   = 9,
    JSCT_LETTER_NUMBER          = 10,
    JSCT_OTHER_NUMBER           = 11,

    /* Separators. */
    JSCT_SPACE_SEPARATOR        = 12,
    JSCT_LINE_SEPARATOR         = 13,
    JSCT_PARAGRAPH_SEPARATOR    = 14,

    /* Control, format, private use and surrogates (17 is skipped). */
    JSCT_CONTROL                = 15,
    JSCT_FORMAT                 = 16,
    JSCT_PRIVATE_USE            = 18,
    JSCT_SURROGATE              = 19,

    /* Punctuation. */
    JSCT_DASH_PUNCTUATION       = 20,
    JSCT_START_PUNCTUATION      = 21,
    JSCT_END_PUNCTUATION        = 22,
    JSCT_CONNECTOR_PUNCTUATION  = 23,
    JSCT_OTHER_PUNCTUATION      = 24,

    /* Symbols. */
    JSCT_MATH_SYMBOL            = 25,
    JSCT_CURRENCY_SYMBOL        = 26,
    JSCT_MODIFIER_SYMBOL        = 27,
    JSCT_OTHER_SYMBOL           = 28
};
typedef enum JSCharType JSCharType;
/*
 * Character classifying and mapping macros, based on java.lang.Character.
 *
 * JS_CCODE(c) fetches the 32-bit attribute word for c through a three-level
 * lookup: js_X is indexed by c (as a uint16) shifted right by six, its result
 * is shifted left by six and ORed with the low six bits of c to index js_Y,
 * and that result indexes js_A. c is expanded twice, so it must be free of
 * side effects.
 *
 * JS_CTYPE(c) is the low five bits of the attribute word; the other macros
 * compare it against the JSCT_* values.
 */
#define JS_CCODE(c) (js_A[js_Y[(js_X[(uint16)(c)>>6]<<6)|((c)&0x3F)]])
#define JS_CTYPE(c) (JS_CCODE(c) & 0x1F)
/*
 * Category-set membership tests. Each macro builds a bit set with one bit per
 * accepted JSCT_* category and tests the bit selected by JS_CTYPE(c). The
 * result is 1 when the category of c is in the set and 0 otherwise, and
 * JS_CTYPE(c) is expanded exactly once.
 */

/* c is in one of the five letter categories. */
#define JS_ISALPHA(c)                                                         \
    ((((1 << JSCT_UPPERCASE_LETTER)                                           \
       | (1 << JSCT_LOWERCASE_LETTER)                                         \
       | (1 << JSCT_TITLECASE_LETTER)                                         \
       | (1 << JSCT_MODIFIER_LETTER)                                          \
       | (1 << JSCT_OTHER_LETTER))                                            \
      & (1u << JS_CTYPE(c))) != 0)

/* c is in a letter category or is a decimal digit. */
#define JS_ISALNUM(c)                                                         \
    ((((1 << JSCT_UPPERCASE_LETTER)                                           \
       | (1 << JSCT_LOWERCASE_LETTER)                                         \
       | (1 << JSCT_TITLECASE_LETTER)                                         \
       | (1 << JSCT_MODIFIER_LETTER)                                          \
       | (1 << JSCT_OTHER_LETTER)                                             \
       | (1 << JSCT_DECIMAL_DIGIT_NUMBER))                                    \
      & (1u << JS_CTYPE(c))) != 0)

/* A unicode letter, suitable for use in an identifier. */
#define JS_ISLETTER(c)                                                        \
    ((((1 << JSCT_UPPERCASE_LETTER)                                           \
       | (1 << JSCT_LOWERCASE_LETTER)                                         \
       | (1 << JSCT_TITLECASE_LETTER)                                         \
       | (1 << JSCT_MODIFIER_LETTER)                                          \
       | (1 << JSCT_OTHER_LETTER)                                             \
       | (1 << JSCT_LETTER_NUMBER))                                           \
      & (1u << JS_CTYPE(c))) != 0)

/*
 * 'IdentifierPart' from the ECMA grammar: a Unicode letter, combining mark,
 * digit or connector punctuation.
 */
#define JS_ISIDPART(c)                                                        \
    ((((1 << JSCT_UPPERCASE_LETTER)                                           \
       | (1 << JSCT_LOWERCASE_LETTER)                                         \
       | (1 << JSCT_TITLECASE_LETTER)                                         \
       | (1 << JSCT_MODIFIER_LETTER)                                          \
       | (1 << JSCT_OTHER_LETTER)                                             \
       | (1 << JSCT_LETTER_NUMBER)                                            \
       | (1 << JSCT_NON_SPACING_MARK)                                         \
       | (1 << JSCT_COMBINING_SPACING_MARK)                                   \
       | (1 << JSCT_DECIMAL_DIGIT_NUMBER)                                     \
       | (1 << JSCT_CONNECTOR_PUNCTUATION))                                   \
      & (1u << JS_CTYPE(c))) != 0)

/* Unicode control-format characters, ignored in input. */
#define JS_ISFORMAT(c)                                                        \
    (((1 << JSCT_FORMAT) & (1u << JS_CTYPE(c))) != 0)
/*
 * Per ECMA-262 15.10.2.6, these characters are the only ones that make up a
 * "word", as far as a RegExp is concerned. If we want a Unicode-friendlier
 * definition of "word", we should rename this macro to something regexp-y.
 *
 * The (c) < 128 test runs first, so isalnum is never called with a value of
 * 128 or above. c is expanded up to three times.
 */
#define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
/* c may start an identifier: a JS_ISLETTER character, '_' or '$'. */
#define JS_ISIDSTART(c) (JS_ISLETTER(c) || (c) == '_' || (c) == '$')
/* c may continue an identifier: a JS_ISIDPART character, '_' or '$'. */
#define JS_ISIDENT(c) (JS_ISIDPART(c) || (c) == '_' || (c) == '$')
/* XML white space: space, tab, carriage return or line feed. */
#define JS_ISXMLSPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\r' || \
    (c) == '\n')
/*
 * XML name classification. JS_ISXMLNSSTART and JS_ISXMLNS test bits 0x100 and
 * 0x80 of the JS_CCODE attribute word respectively, plus a few punctuation
 * characters; the NAME variants additionally accept ':'.
 * NOTE(review): the names suggest "namespace-qualified name start/part" and
 * "name start/part" -- confirm against the table generator.
 */
#define JS_ISXMLNSSTART(c) ((JS_CCODE(c) & 0x00000100) || (c) == '_')
#define JS_ISXMLNS(c) ((JS_CCODE(c) & 0x00000080) || (c) == '.' || \
    (c) == '-' || (c) == '_')
#define JS_ISXMLNAMESTART(c) (JS_ISXMLNSSTART(c) || (c) == ':')
#define JS_ISXMLNAME(c) (JS_ISXMLNS(c) || (c) == ':')
/* c is in the decimal digit category. */
#define JS_ISDIGIT(c) (JS_CTYPE(c) == JSCT_DECIMAL_DIGIT_NUMBER)
/* XXXbe unify on A/X/Y tbls, avoid ctype.h? */
/* XXXbe fs, etc. ? */
/* True when the three-bit field at bits 16-18 of the attribute word is 4. */
#define JS_ISSPACE(c) ((JS_CCODE(c) & 0x00070000) == 0x00040000)
/* Below 128 and printable per isprint; the range test guards the ctype call. */
#define JS_ISPRINT(c) ((c) < 128 && isprint(c))
#define JS_ISUPPER(c) (JS_CTYPE(c) == JSCT_UPPERCASE_LETTER)
#define JS_ISLOWER(c) (JS_CTYPE(c) == JSCT_LOWERCASE_LETTER)
/*
 * Case mapping. Bit 0x00100000 of the attribute word marks a character that
 * has an upper-case mapping and bit 0x00200000 one that has a lower-case
 * mapping. The distance to the mapped character is the signed value held in
 * the bits from 22 upwards; it is subtracted for upper case and added for
 * lower case. Characters without the relevant bit are returned unchanged. The
 * result is truncated to jschar. c is expanded several times.
 */
#define JS_TOUPPER(c) ((jschar) ((JS_CCODE(c) & 0x00100000) \
    ? (c) - ((int32)JS_CCODE(c) >> 22) \
    : (c)))
#define JS_TOLOWER(c) ((jschar) ((JS_CCODE(c) & 0x00200000) \
    ? (c) + ((int32)JS_CCODE(c) >> 22) \
    : (c)))
/*
 * Shorthands for ASCII (7-bit) decimal and hex conversion.
 *
 * The JS7_IS* predicates test (c) < 128 before calling into <ctype.h>, so
 * they are safe for any non-negative character code. JS7_UNDEC and JS7_UNHEX
 * do no checking of their own: call them only after the matching JS7_ISDEC or
 * JS7_ISHEX test has succeeded. All of these expand c more than once, so the
 * argument must be free of side effects.
 *
 * JS7_UNHEX is wrapped in an outer pair of parentheses so that the cast it
 * contains cannot combine with operators at the point of use (for example
 * sizeof or a postfix operator).
 */
#define JS7_ISDEC(c)    ((c) < 128 && isdigit(c))
#define JS7_UNDEC(c)    ((c) - '0')
#define JS7_ISHEX(c)    ((c) < 128 && isxdigit(c))
#define JS7_UNHEX(c)    ((uintN)(isdigit(c) ? (c) - '0' : 10 + tolower(c) - 'a'))
#define JS7_ISLET(c)    ((c) < 128 && isalpha(c))
/* Initialize truly global state associated with JS strings. */
extern JSBool
js_InitStringGlobals(void);
/* NOTE(review): presumably releases what js_InitStringGlobals set up. */
extern void
js_FreeStringGlobals(void);
/*
 * NOTE(review): presumably drops the deflated string cache entry (see
 * js_GetStringBytes) associated with str -- confirm.
 */
extern void
js_PurgeDeflatedStringCache(JSString *str);
/* Initialize per-runtime string state for the first context in the runtime. */
extern JSBool
js_InitRuntimeStringState(JSContext *cx);
/* NOTE(review): presumably the counterpart of js_InitRuntimeStringState. */
extern void
js_FinishRuntimeStringState(JSContext *cx);
/* The JSClass of String objects. */
extern JSClass js_StringClass;
/* Initialize the String class, returning its prototype object. */
extern JSObject *
js_InitStringClass(JSContext *cx, JSObject *obj);
/*
 * Shared constant C strings.
 * NOTE(review): judging by the identifiers these presumably hold the names of
 * the corresponding global functions ("escape", "unescape", ...) -- confirm
 * against the definitions.
 */
extern const char js_escape_str[];
extern const char js_unescape_str[];
extern const char js_uneval_str[];
extern const char js_decodeURI_str[];
extern const char js_encodeURI_str[];
extern const char js_decodeURIComponent_str[];
extern const char js_encodeURIComponent_str[];
/*
 * GC-allocate a string descriptor for the given malloc-allocated chars.
 * NOTE(review): the descriptor presumably takes ownership of chars on
 * success; confirm who frees chars on failure.
 */
extern JSString *
js_NewString(JSContext *cx, jschar *chars, size_t length, uintN gcflag);
/*
 * NOTE(review): presumably GC-allocates a dependent string that shares the
 * length characters of base beginning at offset start -- confirm.
 */
extern JSString *
js_NewDependentString(JSContext *cx, JSString *base, size_t start,
    size_t length, uintN gcflag);
/* Copy a counted string (n jschars at s) and GC-allocate a descriptor for it. */
extern JSString *
js_NewStringCopyN(JSContext *cx, const jschar *s, size_t n, uintN gcflag);
/*
 * Copy a jschar string given without an explicit length and GC-allocate a
 * descriptor for it. NOTE(review): s is presumably zero-terminated; it is a
 * jschar string, not a char string.
 */
extern JSString *
js_NewStringCopyZ(JSContext *cx, const jschar *s, uintN gcflag);
/* Free the chars held by str when it is finalized by the GC. */
extern void
js_FinalizeString(JSContext *cx, JSString *str);
/* NOTE(review): presumably the same as above for callers holding only rt. */
extern void
js_FinalizeStringRT(JSRuntime *rt, JSString *str);
/* Wrap a string value in a String object. */
extern JSObject *
js_StringToObject(JSContext *cx, JSString *str);
/*
 * Convert a value to a printable C string.
 * NOTE(review): ownership and lifetime of the returned pointer, and the
 * result on failure, are not stated here -- confirm before holding on to it.
 */
extern JS_FRIEND_API(const char *)
js_ValueToPrintableString(JSContext *cx, jsval v);
/*
 * Convert a value to a string, returning null after reporting an error,
 * otherwise returning a new string reference.
 */
extern JSString *
js_ValueToString(JSContext *cx, jsval v);
/*
 * Convert a value to its source expression, returning null after reporting
 * an error, otherwise returning a new string reference.
 */
extern JSString *
js_ValueToSource(JSContext *cx, jsval v);
/*
 * js_HashString is declared only when HT_ENUMERATE_NEXT is defined.
 * NOTE(review): that macro presumably comes from jshash.h, which supplies the
 * JSHashNumber type used in the prototype.
 */
#ifdef HT_ENUMERATE_NEXT /* XXX don't require jshash.h */
/*
 * Compute a hash function from str.
 */
extern JSHashNumber
js_HashString(JSString *str);
#endif
/*
 * Return less than, equal to, or greater than zero depending on whether
 * str1 is less than, equal to, or greater than str2.
 */
extern intN
js_CompareStrings(JSString *str1, JSString *str2);
/*
 * Boyer-Moore-Horspool superlinear search for pat:patlen in text:textlen.
 * The patlen argument must be positive and no greater than BMH_PATLEN_MAX.
 * The start argument tells where in text to begin the search.
 *
 * Return the index of pat in text, -1 if not found, or BMH_BAD_PATTERN if
 * pat is not ISO-Latin-1.
 */
#define BMH_CHARSET_SIZE 256 /* ISO-Latin-1 */
#define BMH_PATLEN_MAX 255 /* skip table element is uint8 */
#define BMH_BAD_PATTERN (-2) /* return value if pat is not ISO-Latin-1 */
extern jsint
js_BoyerMooreHorspool(const jschar *text, jsint textlen,
    const jschar *pat, jsint patlen,
    jsint start);
/*
 * jschar counterparts of the C string routines.
 * NOTE(review): presumably these mirror strlen and strchr for zero-terminated
 * jschar strings, with js_strchr_limit stopping at limit instead of at a
 * terminator -- confirm against the definitions.
 */
extern size_t
js_strlen(const jschar *s);
extern jschar *
js_strchr(const jschar *s, jschar c);
extern jschar *
js_strchr_limit(const jschar *s, jschar c, const jschar *limit);
/*
 * Copy exactly n jschars from s to t with memcpy. Unlike strncpy this neither
 * stops at a terminator nor pads or terminates the destination, and the two
 * regions must not overlap.
 */
#define js_strncpy(t, s, n) memcpy((t), (s), (n) * sizeof(jschar))
/*
 * Return s advanced past any Unicode white space characters.
 */
extern const jschar *
js_SkipWhiteSpace(const jschar *s);
/*
 * Inflate bytes to JS chars and vice versa. Report out of memory via cx
 * and return null on error, otherwise return the jschar or byte vector that
 * was JS_malloc'ed.
 *
 * For js_InflateString, *length is updated with the length of the new string
 * in jschars. NOTE(review): on entry *length presumably holds the number of
 * bytes to inflate -- confirm.
 */
extern jschar *
js_InflateString(JSContext *cx, const char *bytes, size_t *length);
extern char *
js_DeflateString(JSContext *cx, const jschar *chars, size_t length);
/*
 * Inflate bytes to JS chars into a buffer.
 * 'chars' must be large enough for 'length' jschars.
 * The buffer is NOT null-terminated.
 * cx may be NULL, which means no errors are thrown.
 * The destination length, *charsLength, must be initialized with the size of
 * the buffer; on return it holds the number of chars moved.
 */
extern JSBool
js_InflateStringToBuffer(JSContext* cx, const char *bytes, size_t length, jschar *chars, size_t* charsLength);
/*
 * Deflate JS chars to bytes into a buffer.
 * 'bytes' must be large enough for 'length' chars.
 * The buffer is NOT null-terminated.
 * cx may be NULL, which means no errors are thrown.
 * The destination length, *length, must be initialized with the size of the
 * buffer; on return it holds the number of bytes moved.
 */
extern JSBool
js_DeflateStringToBuffer(JSContext* cx, const jschar *chars, size_t charsLength, char *bytes, size_t* length);
/*
 * Associate bytes with str in the deflated string cache, returning true on
 * successful association, false on out of memory.
 * NOTE(review): the cache presumably takes ownership of bytes on success --
 * confirm who frees it on failure.
 */
extern JSBool
js_SetStringBytes(JSString *str, char *bytes, size_t length);
/*
 * Find or create a deflated string cache entry for str that contains its
 * characters chopped from Unicode code points into bytes.
 * NOTE(review): the result on allocation failure is not stated here; the
 * returned bytes presumably belong to the cache and must not be freed by the
 * caller.
 */
extern char *
js_GetStringBytes(JSString *str);
469 JSBool
470 js_str_escape(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
471 jsval *rval);
/*
 * Convert one UCS-4 char and write it into a UTF-8 buffer, which must be at
 * least 6 bytes long. Return the number of UTF-8 bytes of data written.
 *
 * utf8Buffer receives the encoded bytes; ucs4Char is the character to
 * encode.
 */
extern int
js_OneUcs4ToUtf8Char(uint8 *utf8Buffer, uint32 ucs4Char);
480 JS_END_EXTERN_C
482 #endif /* jsstr_h___ */