roundup/token.py

   1 #
   2 # Copyright (c) 2001 Richard Jones, richard@bofh.asn.au.
   3 # This module is free software, and you may redistribute it and/or modify
   4 # under the same terms as Python, so long as this copyright message and
   5 # disclaimer are retained in their original form.
   6 #
   7 # This module is distributed in the hope that it will be useful,
   8 # but WITHOUT ANY WARRANTY; without even the implied warranty of
   9 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10 #
  11 # $Id: token.py,v 1.3 2002-09-10 00:18:20 richard Exp $
  12 #
  13
  14 __doc__ = """
  15 This module provides the tokeniser used by roundup-admin.
  16 """
  17
  18 def token_split(s, whitespace=' \r\n\t', quotes='\'"',
  19         escaped={'r':'\r', 'n':'\n', 't':'\t'}):
  20     '''Split the string up into tokens. An occurence of a ' or " in the
  21        input will cause the splitter to ignore whitespace until a matching
  22        quote char is found. Embedded non-matching quote chars are also
  23        skipped.
  24        Whitespace and quoting characters may be escaped using a backslash.
  25        \r, \n and \t are converted to carriage-return, newline and tab.
  26        All other backslashed characters are left as-is.
  27        Valid:
  28            hello world      (2 tokens: hello, world)
  29            "hello world"    (1 token: hello world)
  30            "Roch'e" Compaan (2 tokens: Roch'e Compaan)
  31            Roch\'e Compaan  (2 tokens: Roch'e Compaan)
  32            address="1 2 3"  (1 token: address=1 2 3)
  33            \\               (1 token: \)
  34            \n               (1 token: a newline)
  35            \o               (1 token: \o)
  36        Invalid:
  37            "hello world     (no matching quote)
  38            Roch'e Compaan   (no matching quote)
  39     '''
  40     l = []
  41     pos = 0
  42     NEWTOKEN = 'newtoken'
  43     TOKEN = 'token'
  44     QUOTE = 'quote'
  45     ESCAPE = 'escape'
  46     quotechar = ''
  47     state = NEWTOKEN
  48     oldstate = ''    # one-level state stack ;)
  49     length = len(s)
  50     finish = 0
  51     token = ''
  52     while 1:
  53         # end of string, finish off the current token
  54         if pos == length:
  55             if state == QUOTE: raise ValueError, "unmatched quote"
  56             elif state == TOKEN: l.append(token)
  57             break
  58         c = s[pos]
  59         if state == NEWTOKEN:
  60             # looking for a new token
  61             if c in quotes:
  62                 # quoted token
  63                 state = QUOTE
  64                 quotechar = c
  65                 pos = pos + 1
  66                 continue
  67             elif c in whitespace:
  68                 # skip whitespace
  69                 pos = pos + 1
  70                 continue
  71             elif c == '\\':
  72                 pos = pos + 1
  73                 oldstate = TOKEN
  74                 state = ESCAPE
  75                 continue
  76             # otherwise we have a token
  77             state = TOKEN
  78         elif state == TOKEN:
  79             if c in whitespace:
  80                 # have a token, and have just found a whitespace terminator
  81                 l.append(token)
  82                 pos = pos + 1
  83                 state = NEWTOKEN
  84                 token = ''
  85                 continue
  86             elif c in quotes:
  87                 # have a token, just found embedded quotes
  88                 state = QUOTE
  89                 quotechar = c
  90                 pos = pos + 1
  91                 continue
  92             elif c == '\\':
  93                 pos = pos + 1
  94                 oldstate = state
  95                 state = ESCAPE
  96                 continue
  97         elif state == QUOTE and c == quotechar:
  98             # in a quoted token and found a matching quote char
  99             pos = pos + 1
 100             # now we're looking for whitespace
 101             state = TOKEN
 102             continue
 103         elif state == ESCAPE:
 104             # escaped-char conversions (t, r, n)
 105             # TODO: octal, hexdigit
 106             state = oldstate
 107             if escaped.has_key(c):
 108                 c = escaped[c]
 109         # just add this char to the token and move along
 110         token = token + c
 111         pos = pos + 1
 112     return l
 113
 114 # vim: set filetype=python ts=4 sw=4 et si