roundup/token.py

   1 #
   2 # Copyright (c) 2001 Richard Jones, richard@bofh.asn.au.
   3 # This module is free software, and you may redistribute it and/or modify
   4 # under the same terms as Python, so long as this copyright message and
   5 # disclaimer are retained in their original form.
   6 #
   7 # This module is distributed in the hope that it will be useful,
   8 # but WITHOUT ANY WARRANTY; without even the implied warranty of
   9 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10 #
  11 # $Id: token.py,v 1.4 2004-02-11 23:55:08 richard Exp $
  12 #
  13
  14 """This module provides the tokeniser used by roundup-admin.
  15 """
  16 __docformat__ = 'restructuredtext'
  17
  18 def token_split(s, whitespace=' \r\n\t', quotes='\'"',
  19         escaped={'r':'\r', 'n':'\n', 't':'\t'}):
  20     '''Split the string up into tokens. An occurence of a ``'`` or ``"`` in
  21     the input will cause the splitter to ignore whitespace until a matching
  22     quote char is found. Embedded non-matching quote chars are also skipped.
  23
  24     Whitespace and quoting characters may be escaped using a backslash.
  25     ``\r``, ``\n`` and ``\t`` are converted to carriage-return, newline and
  26     tab.  All other backslashed characters are left as-is.
  27
  28     Valid examples::
  29
  30            hello world      (2 tokens: hello, world)
  31            "hello world"    (1 token: hello world)
  32            "Roch'e" Compaan (2 tokens: Roch'e Compaan)
  33            Roch\'e Compaan  (2 tokens: Roch'e Compaan)
  34            address="1 2 3"  (1 token: address=1 2 3)
  35            \\               (1 token: \)
  36            \n               (1 token: a newline)
  37            \o               (1 token: \o)
  38
  39     Invalid examples::
  40
  41            "hello world     (no matching quote)
  42            Roch'e Compaan   (no matching quote)
  43     '''
  44     l = []
  45     pos = 0
  46     NEWTOKEN = 'newtoken'
  47     TOKEN = 'token'
  48     QUOTE = 'quote'
  49     ESCAPE = 'escape'
  50     quotechar = ''
  51     state = NEWTOKEN
  52     oldstate = ''    # one-level state stack ;)
  53     length = len(s)
  54     finish = 0
  55     token = ''
  56     while 1:
  57         # end of string, finish off the current token
  58         if pos == length:
  59             if state == QUOTE: raise ValueError, "unmatched quote"
  60             elif state == TOKEN: l.append(token)
  61             break
  62         c = s[pos]
  63         if state == NEWTOKEN:
  64             # looking for a new token
  65             if c in quotes:
  66                 # quoted token
  67                 state = QUOTE
  68                 quotechar = c
  69                 pos = pos + 1
  70                 continue
  71             elif c in whitespace:
  72                 # skip whitespace
  73                 pos = pos + 1
  74                 continue
  75             elif c == '\\':
  76                 pos = pos + 1
  77                 oldstate = TOKEN
  78                 state = ESCAPE
  79                 continue
  80             # otherwise we have a token
  81             state = TOKEN
  82         elif state == TOKEN:
  83             if c in whitespace:
  84                 # have a token, and have just found a whitespace terminator
  85                 l.append(token)
  86                 pos = pos + 1
  87                 state = NEWTOKEN
  88                 token = ''
  89                 continue
  90             elif c in quotes:
  91                 # have a token, just found embedded quotes
  92                 state = QUOTE
  93                 quotechar = c
  94                 pos = pos + 1
  95                 continue
  96             elif c == '\\':
  97                 pos = pos + 1
  98                 oldstate = state
  99                 state = ESCAPE
 100                 continue
 101         elif state == QUOTE and c == quotechar:
 102             # in a quoted token and found a matching quote char
 103             pos = pos + 1
 104             # now we're looking for whitespace
 105             state = TOKEN
 106             continue
 107         elif state == ESCAPE:
 108             # escaped-char conversions (t, r, n)
 109             # TODO: octal, hexdigit
 110             state = oldstate
 111             if escaped.has_key(c):
 112                 c = escaped[c]
 113         # just add this char to the token and move along
 114         token = token + c
 115         pos = pos + 1
 116     return l
 117
 118 # vim: set filetype=python ts=4 sw=4 et si