1 #
2 # Copyright (c) 2001 Richard Jones, richard@bofh.asn.au.
3 # This module is free software, and you may redistribute it and/or modify
4 # under the same terms as Python, so long as this copyright message and
5 # disclaimer are retained in their original form.
6 #
7 # This module is distributed in the hope that it will be useful,
8 # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
10 #
11 # $Id: token.py,v 1.3 2002-09-10 00:18:20 richard Exp $
12 #
# Module documentation string.
__doc__ = (
    "\n"
    "This module provides the tokeniser used by roundup-admin.\n"
)
def token_split(s, whitespace=' \r\n\t', quotes='\'"',
        escaped={'r':'\r', 'n':'\n', 't':'\t'}):
    r'''Split the string s up into a list of tokens.

    Tokens are separated by runs of whitespace. An occurrence of a quote
    char (' or " by default) in the input will cause the splitter to
    ignore whitespace until a matching quote char is found; the
    delimiting quote chars are dropped from the token. Non-matching quote
    chars embedded in a quoted section are kept as ordinary characters.

    Outside of quotes, a backslash escapes the character following it:
    whitespace, quote chars and the backslash itself lose their special
    meaning, \r, \n and \t are converted to carriage-return, newline and
    tab, and any other character is kept as-is (the backslash itself is
    dropped). Inside quotes a backslash is an ordinary character.

    Arguments:
        s          -- the string to split
        whitespace -- characters that separate tokens
        quotes     -- characters that open (and close) a quoted section
        escaped    -- mapping from the character following a backslash
                      to its replacement text. The default dict is
                      shared between calls; it is only read here, never
                      modified.

    Returns the list of tokens, in input order. A quoted empty string
    ("") produces an empty-string token.

    Raises ValueError if a quoted section is never closed, or if the
    input ends with a backslash that has no character left to escape.

    Valid:
        hello world      (2 tokens: hello, world)
        "hello world"    (1 token: hello world)
        "Roch'e" Compaan (2 tokens: Roch'e, Compaan)
        Roch\'e Compaan  (2 tokens: Roch'e, Compaan)
        address="1 2 3"  (1 token: address=1 2 3)
        \\               (1 token: \)
        \n               (1 token: a newline)
        \o               (1 token: o)
    Invalid:
        "hello world     (no matching quote)
        Roch'e Compaan   (no matching quote)
        hello\           (nothing after the backslash)
    '''
    # Parser states.
    NEWTOKEN = 'newtoken'   # between tokens, skipping whitespace
    TOKEN = 'token'         # inside the unquoted part of a token
    QUOTE = 'quote'         # inside a quoted section of a token
    ESCAPE = 'escape'       # the previous character was a backslash

    tokens = []
    chars = []              # pieces of the token currently being built
    quotechar = ''          # the quote char that opened the current section
    state = NEWTOKEN

    for c in s:
        if state == ESCAPE:
            # escaped-char conversions (t, r, n); anything else is taken
            # literally
            # TODO: octal, hexdigit
            if c in escaped:
                c = escaped[c]
            chars.append(c)
            # escapes are only recognised outside quotes, and an escaped
            # character always belongs to a token, so we resume in TOKEN
            state = TOKEN
        elif state == QUOTE:
            if c == quotechar:
                # matching quote found: back to looking for whitespace
                state = TOKEN
            else:
                # everything else, including backslashes and the other
                # quote char, is literal inside quotes
                chars.append(c)
        elif state == NEWTOKEN:
            # looking for the start of a new token
            if c in quotes:
                quotechar = c
                state = QUOTE
            elif c in whitespace:
                # skip whitespace between tokens
                pass
            elif c == '\\':
                state = ESCAPE
            else:
                chars.append(c)
                state = TOKEN
        else:
            # state == TOKEN
            if c in whitespace:
                # whitespace terminates the current token
                tokens.append(''.join(chars))
                chars = []
                state = NEWTOKEN
            elif c in quotes:
                # quoted section embedded in a token
                quotechar = c
                state = QUOTE
            elif c == '\\':
                state = ESCAPE
            else:
                chars.append(c)

    # end of string: finish off (or reject) whatever was in progress
    if state == QUOTE:
        raise ValueError("unmatched quote")
    if state == ESCAPE:
        # previously the pending token was silently discarded here
        raise ValueError("no character to escape after trailing backslash")
    if state == TOKEN:
        tokens.append(''.join(chars))
    return tokens
114 # vim: set filetype=python ts=4 sw=4 et si