1 #
2 # Copyright (c) 2001 Richard Jones, richard@bofh.asn.au.
3 # This module is free software, and you may redistribute it and/or modify
4 # under the same terms as Python, so long as this copyright message and
5 # disclaimer are retained in their original form.
6 #
7 # This module is distributed in the hope that it will be useful,
8 # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
10 #
11 # $Id: token.py,v 1.4 2004-02-11 23:55:08 richard Exp $
12 #
"""This module provides the tokeniser used by roundup-admin.

The single entry point is ``token_split``, which breaks a string into
tokens on whitespace while honouring quote characters and backslash
escapes.
"""
# Docstrings in this module are written in reStructuredText.
__docformat__ = 'restructuredtext'
def token_split(s, whitespace=' \r\n\t', quotes='\'"',
                escaped={'r': '\r', 'n': '\n', 't': '\t'}):
    r"""Split the string ``s`` into a list of tokens.

    Tokens are separated by runs of ``whitespace`` characters. An
    occurrence of a ``'`` or ``"`` in the input causes the splitter to
    ignore whitespace until the matching quote char is found; the quote
    chars themselves are not part of the token. Embedded non-matching
    quote chars inside a quoted section are kept as ordinary characters.

    Outside a quoted section, a backslash escapes the next character:
    ``\r``, ``\n`` and ``\t`` are converted to carriage-return, newline
    and tab (per the ``escaped`` mapping); for any other character the
    backslash is dropped and the character itself is kept, so escaped
    whitespace, quotes and backslashes become literal. Inside a quoted
    section a backslash is an ordinary character.

    Valid examples::

        hello world       (2 tokens: hello, world)
        "hello world"     (1 token: hello world)
        "Roch'e" Compaan  (2 tokens: Roch'e, Compaan)
        Roch\'e Compaan   (2 tokens: Roch'e, Compaan)
        address="1 2 3"   (1 token: address=1 2 3)
        \\                (1 token: \)
        \n                (1 token: a newline)
        \o                (1 token: o)
        ""                (1 token: the empty string)

    Invalid examples::

        "hello world      (no matching quote)
        Roch'e Compaan    (no matching quote)
        hello\            (backslash with nothing to escape)

    :param s: the string to split.
    :param whitespace: characters that separate tokens.
    :param quotes: characters that open (and, when repeated, close) a
        quoted section.
    :param escaped: mapping from the character following a backslash to
        its replacement. The default mapping is shared between calls and
        is only ever read, never modified.
    :returns: the list of tokens, in input order.
    :raises ValueError: if the input ends inside a quoted section or
        directly after an escaping backslash.
    """
    NEWTOKEN = 'newtoken'   # between tokens, skipping whitespace
    TOKEN = 'token'         # inside an unquoted part of a token
    QUOTE = 'quote'         # inside a quoted part of a token
    ESCAPE = 'escape'       # just consumed a backslash

    tokens = []
    chars = []              # characters of the token being built
    quotechar = ''          # quote char that opened the current section
    state = NEWTOKEN
    oldstate = ''           # one-level state stack: resumed after ESCAPE

    for c in s:
        if state == NEWTOKEN:
            # looking for the start of a new token
            if c in quotes:
                # token starts with a quoted section
                state = QUOTE
                quotechar = c
                continue
            if c in whitespace:
                # skip separator whitespace
                continue
            if c == '\\':
                # token starts with an escape; what follows is token text
                oldstate = TOKEN
                state = ESCAPE
                continue
            # any other character starts a plain token and is kept below
            state = TOKEN
        elif state == TOKEN:
            if c in whitespace:
                # whitespace terminates the current token
                tokens.append(''.join(chars))
                chars = []
                state = NEWTOKEN
                continue
            if c in quotes:
                # quoted section embedded in the token, e.g. a="1 2"
                state = QUOTE
                quotechar = c
                continue
            if c == '\\':
                oldstate = state
                state = ESCAPE
                continue
        elif state == QUOTE:
            if c == quotechar:
                # matching quote closes the section; the token itself
                # continues until whitespace
                state = TOKEN
                continue
        elif state == ESCAPE:
            # escaped-char conversions (t, r, n)
            # TODO: octal, hexdigit
            state = oldstate
            if c in escaped:
                c = escaped[c]
        # just add this char to the token and move along
        chars.append(c)

    # end of string: finish off the current token
    if state == QUOTE:
        raise ValueError("unmatched quote")
    if state == ESCAPE:
        # a trailing backslash has nothing to escape; report it rather
        # than silently discarding the token built so far
        raise ValueError("dangling escape character")
    if state == TOKEN:
        tokens.append(''.join(chars))
    return tokens
118 # vim: set filetype=python ts=4 sw=4 et si