Commit | Line | Data |
---|---|---|
c638d827 CR |
1 | # |
2 | # Copyright (c) 2001 Richard Jones, richard@bofh.asn.au. | |
3 | # This module is free software, and you may redistribute it and/or modify | |
4 | # under the same terms as Python, so long as this copyright message and | |
5 | # disclaimer are retained in their original form. | |
6 | # | |
7 | # This module is distributed in the hope that it will be useful, | |
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
10 | # | |
11 | # $Id: token.py,v 1.4 2004-02-11 23:55:08 richard Exp $ | |
12 | # | |
13 | ||
14 | """This module provides the tokeniser used by roundup-admin. | |
15 | """ | |
16 | __docformat__ = 'restructuredtext' | |
17 | ||
def token_split(s, whitespace=' \r\n\t', quotes='\'"',
                escaped={'r':'\r', 'n':'\n', 't':'\t'}):
    r'''Split the string up into tokens. An occurrence of a ``'`` or ``"`` in
    the input will cause the splitter to ignore whitespace until a matching
    quote char is found. Embedded non-matching quote chars are also skipped.

    Outside of quotes, whitespace and quoting characters may be escaped
    using a backslash. ``\r``, ``\n`` and ``\t`` are converted to
    carriage-return, newline and tab. Any other backslashed character is
    added to the token unchanged and the backslash itself is dropped.
    Inside a quoted section a backslash is an ordinary character.

    Valid examples::

        hello world       (2 tokens: hello, world)
        "hello world"     (1 token: hello world)
        "Roch'e" Compaan  (2 tokens: Roch'e Compaan)
        Roch\'e Compaan   (2 tokens: Roch'e Compaan)
        address="1 2 3"   (1 token: address=1 2 3)
        \\                (1 token: \)
        \n                (1 token: a newline)
        \o                (1 token: o)
        hello\            (1 token: hello\ -- a trailing backslash is kept)

    Invalid examples::

        "hello world      (no matching quote)
        Roch'e Compaan    (no matching quote)

    Parameters:

    - ``s``: the string to split.
    - ``whitespace``: characters that separate tokens outside of quotes.
    - ``quotes``: characters that open (and must also close) a quoted
      section.
    - ``escaped``: mapping from the character following a backslash to its
      replacement. It is only read, never modified.

    Returns the list of tokens, in order. A quoted empty string (``""``)
    produces an empty-string token.

    Raises ``ValueError`` if the input ends inside a quoted section.
    '''
    NEWTOKEN = 'newtoken'   # between tokens, skipping whitespace
    TOKEN = 'token'         # inside an unquoted part of a token
    QUOTE = 'quote'         # inside a quoted part of a token
    ESCAPE = 'escape'       # the previous character was a backslash

    tokens = []
    chars = []              # characters of the token being built
    quotechar = ''
    state = NEWTOKEN

    for c in s:
        if state == ESCAPE:
            # escaped-char conversions (t, r, n)
            # TODO: octal, hexdigit
            # An escape is only ever entered from outside quotes, so it
            # always resumes the unquoted-token state.
            state = TOKEN
            if c in escaped:
                c = escaped[c]
            chars.append(c)
        elif state == QUOTE:
            if c == quotechar:
                # matching quote found; keep building the same token until
                # whitespace terminates it
                state = TOKEN
            else:
                chars.append(c)
        elif state == NEWTOKEN:
            # looking for the start of a new token
            if c in quotes:
                state = QUOTE
                quotechar = c
            elif c in whitespace:
                pass
            elif c == '\\':
                state = ESCAPE
            else:
                state = TOKEN
                chars.append(c)
        else:
            # state == TOKEN
            if c in whitespace:
                # whitespace terminates the current token
                tokens.append(''.join(chars))
                chars = []
                state = NEWTOKEN
            elif c in quotes:
                # quotes embedded in a token, e.g. address="1 2 3"
                state = QUOTE
                quotechar = c
            elif c == '\\':
                state = ESCAPE
            else:
                chars.append(c)

    # end of string, finish off the current token
    if state == QUOTE:
        raise ValueError("unmatched quote")
    if state == ESCAPE:
        # A dangling backslash has nothing to escape; keep it literally
        # rather than silently discarding the token collected so far.
        chars.append('\\')
        state = TOKEN
    if state == TOKEN:
        tokens.append(''.join(chars))
    return tokens
117 | ||
118 | # vim: set filetype=python ts=4 sw=4 et si |