1a67e1e467b02a38348df67cf66aea6797264623
1 #! /usr/bin/env python
2 # Originally written by Barry Warsaw <bwarsaw@python.org>
3 #
4 # minimally patched to make it even more xgettext compatible
5 # by Peter Funk <pf@artcom-gmbh.de>
7 # for selftesting
try:
    import fintl
except ImportError:
    # fintl is not available: fall back to an identity function so that
    # strings marked for translation are simply used untranslated.
    _ = lambda s: s
else:
    # fintl imported fine: use its gettext as the translation function.
    _ = fintl.gettext
15 __doc__ = _("""pygettext -- Python equivalent of xgettext(1)
17 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
18 internationalization of C programs. Most of these tools are independent of
19 the programming language and can be used from within Python programs. Martin
20 von Loewis' work[1] helps considerably in this regard.
22 There's one problem though; xgettext is the program that scans source code
23 looking for message strings, but it groks only C (or C++). Python introduces
24 a few wrinkles, such as dual quoting characters, triple quoted strings, and
25 raw strings. xgettext understands none of this.
27 Enter pygettext, which uses Python's standard tokenize module to scan Python
28 source code, generating .pot files identical to what GNU xgettext[2] generates
29 for C and C++ code. From there, the standard GNU tools can be used.
31 A word about marking Python strings as candidates for translation. GNU
32 xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
33 gettext_noop. But those can be a lot of text to include all over your code.
34 C and C++ have a trick: they use the C preprocessor. Most internationalized C
35 source includes a #define for gettext() to _() so that what has to be written
36 in the source is much less. Thus these are both translatable strings:
38 gettext("Translatable String")
39 _("Translatable String")
41 Python of course has no preprocessor so this doesn't work so well. Thus,
42 pygettext searches only for _() by default, but see the -k/--keyword flag
43 below for how to augment this.
45 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
46 [2] http://www.gnu.org/software/gettext/gettext.html
48 NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
49 where ever possible. However some options are still missing or are not fully
50 implemented. Also, xgettext's use of command line switches with option
51 arguments is broken, and in these cases, pygettext just defines additional
52 switches.
54 Usage: pygettext [options] inputfile ...
56 Options:
58 -a
59 --extract-all
60 Extract all strings
62 -d name
63 --default-domain=name
64 Rename the default output file from messages.pot to name.pot
66 -E
67 --escape
68 replace non-ASCII characters with octal escape sequences.
70 -h
71 --help
72 print this help message and exit
74 -k word
75 --keyword=word
76 Keywords to look for in addition to the default set, which are:
77 %(DEFAULTKEYWORDS)s
79 You can have multiple -k flags on the command line.
81 -K
82 --no-default-keywords
83 Disable the default set of keywords (see above). Any keywords
84 explicitly added with the -k/--keyword option are still recognized.
86 --no-location
87 Do not write filename/lineno location comments.
89 -n
90 --add-location
91 Write filename/lineno location comments indicating where each
92 extracted string is found in the source. These lines appear before
93 each msgid. The style of comments is controlled by the -S/--style
94 option. This is the default.
96 -S stylename
97 --style stylename
98 Specify which style to use for location comments. Two styles are
99 supported:
101 Solaris # File: filename, line: line-number
102 GNU #: filename:line
104 The style name is case insensitive. GNU style is the default.
106 -o filename
107 --output=filename
108 Rename the default output file from messages.pot to filename. If
109 filename is `-' then the output is sent to standard out.
111 -p dir
112 --output-dir=dir
113 Output files will be placed in directory dir.
115 -v
116 --verbose
117 Print the names of the files being processed.
119 -V
120 --version
121 Print the version of pygettext and exit.
123 -w columns
124 --width=columns
125 Set width of output to columns.
127 -x filename
128 --exclude-file=filename
129 Specify a file that contains a list of strings that are not be
130 extracted from the input files. Each string to be excluded must
131 appear on a line by itself in the file.
133 If `inputfile' is -, standard input is read.
135 """)
137 import os
138 import sys
139 import time
140 import getopt
141 import token
142 import tokenize
# Version string reported by -V/--version and written into the
# "Generated-By" field of the .pot header.
__version__ = '1.1'

# Keywords that mark a translatable string by default.  main() replaces this
# list with an empty one when -K/--no-default-keywords is given.
default_keywords = ['_']
# Comma-separated form of the defaults; the usage text refers to it as
# %(DEFAULTKEYWORDS)s and usage() interpolates it from the module globals.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used when joining string fragments together.
EMPTYSTRING = ''
152 \f
# The normal pot-file header.  msgmerge and EMACS' po-mode work better if
# it's there.
#
# This is a %-format template: TokenEater.write() fills in %(time)s and
# %(version)s.  The backslashes before "n" are doubled so that the generated
# file contains the two-character escape \n inside the quoted header fields
# rather than a real line break.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"PO-Revision-Date: %(time)s\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"
''')
173 \f
def usage(code, msg=''):
    """Print the usage text and an optional message, then exit.

    code -- exit status handed to sys.exit().
    msg  -- optional extra text (typically an error description) printed
            after the usage text; nothing extra is printed when it is empty.

    The text is written to standard output when code is 0 (an explicit
    request for help) and to standard error otherwise, so that diagnostics
    are not mixed into output the caller may be capturing.
    """
    if code:
        stream = sys.stderr
    else:
        stream = sys.stdout
    # The usage text contains %-placeholders (the default keyword list) that
    # are resolved from the module globals.
    print >>stream, __doc__ % globals()
    if msg:
        print >>stream, msg
    sys.exit(code)
181 \f
# Escape table: once make_escapes() has run, escapes[n] is the text written
# to the .po file for the character with ordinal n (0..255).
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the module-level escape table used by escape().

    pass_iso8859 -- if true, allow iso-8859 characters to pass through so
        that e.g. 'msgid "H<o-umlaut>he"' does not result in
        'msgid "H\\366he"': ordinals 160..254 are emitted unchanged in
        addition to printable ASCII.  If false, every character outside the
        32..126 range is written as a three-digit octal escape.

    Backslash, tab, carriage return, newline and double quote always get
    their C-style escapes.

    The table is rebuilt from scratch on every call, so calling this
    function again with a different argument takes effect.
    """
    if pass_iso8859:
        mod = 128
    else:
        mod = 256
    # Empty the list in place (existing references to it stay valid).
    # Without this, a second call would append another 256 entries and the
    # table built by the first call would remain the one actually indexed.
    del escapes[:]
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            escapes.append(chr(i))
        else:
            escapes.append("\\%03o" % i)
    # Characters with a conventional C-style escape override the defaults.
    escapes[ord('\\')] = '\\\\'
    escapes[ord('\t')] = '\\t'
    escapes[ord('\r')] = '\\r'
    escapes[ord('\n')] = '\\n'
    escapes[ord('\"')] = '\\"'
def escape(s):
    """Return s with every character replaced by its escape-table entry.

    Each character is looked up by ordinal in the module-level `escapes`
    list (filled in by make_escapes()) and the results are concatenated.
    """
    return EMPTYSTRING.join([escapes[ord(ch)] for ch in s])
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    s is evaluated as an expression with an empty builtins mapping and an
    empty local namespace, which strips the quotes (and any prefix) and
    resolves the escape sequences.
    """
    # NOTE(review): eval() with emptied builtins is not a real sandbox.  It
    # is applied here to text the tokenizer classified as a STRING token;
    # do not reuse this helper on arbitrary untrusted expressions.
    restricted_globals = {'__builtins__': {}}
    empty_locals = {}
    return eval(s, restricted_globals, empty_locals)
def normalize(s):
    """Convert a Python string value into its quoted .po-file form.

    A string without newlines becomes one double-quoted, escaped string.
    A string containing newlines becomes an empty "" followed by one quoted
    string per source line, each but the last ending in an escaped \\n,
    which is much closer to the C style used in .po files.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        # No newline at all: a single quoted string is enough.
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # s ended with a newline.  Drop the empty tail and hand the newline
        # back to the last real line, so that it is written as an escape
        # instead of producing an extra empty quoted string.
        pieces.pop()
        pieces[-1] = pieces[-1] + '\n'
    escaped = [escape(piece) for piece in pieces]
    # Close each line with an escaped newline and reopen the quotes on the
    # following output line.
    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
235 \f
class TokenEater:
    """Token callback that collects the strings marked for translation.

    An instance is passed to tokenize.tokenize() as the token consumer.  It
    is a small state machine; the current state is a bound method stored in
    self.__state:

      __waiting      looking for a NAME token that is one of the keywords
      __keywordseen  a keyword was seen; the next token must be '('
      __openseen     inside the parentheses, collecting STRING tokens
                     until the closing ')'

    Collected messages are kept in a dictionary that maps the message text
    to a list of (filename, lineno) pairs, one per occurrence.

    options must provide the attributes read here: keywords, toexclude,
    writelocations, locationstyle, width, and the GNU and SOLARIS
    constants.
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        # Name of the file being scanned; set through set_filename().
        # Initialised here so that using the eater before set_filename() is
        # called does not fail with an AttributeError.
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the current state; only the token type, its text and
        # the row of its start position are needed.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            # Start a fresh batch of string fragments and remember the line
            # on which the call starts.
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            # The keyword was not used as a call; go back to scanning.
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside the call, just ignore this entry.
            if self.__data:
                msg = EMPTYSTRING.join(self.__data)
                if msg not in self.__options.toexclude:
                    entry = (self.__curfile, self.__lineno)
                    self.__messages.setdefault(msg, []).append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            # Adjacent string literals are concatenated, as Python does.
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # Warn if we see anything else than STRING or whitespace, and
            # give up on this call.
            print >>sys.stderr, _('*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"') % {
                'token': tstring, 'file': self.__curfile, 'lineno': self.__lineno}
            self.__state = self.__waiting

    def set_filename(self, filename):
        """Set the file name recorded with messages found from now on."""
        self.__curfile = filename

    def write(self, fp):
        """Write the .pot header and all collected messages to fp.

        fp is any object the print statement can write to.  The output is
        written to fp directly; sys.stdout is not touched, so a redirection
        installed by the caller stays in place.
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        # common header
        # The time stamp in the header doesn't have the same format
        # as that generated by xgettext...
        print >>fp, pot_header % {'time': timestamp, 'version': __version__}
        for k, v in self.__messages.items():
            if not options.writelocations:
                pass
            # location comments are different b/w Solaris and GNU:
            elif options.locationstyle == options.SOLARIS:
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    print >>fp, _('# File: %(filename)s, line: %(lineno)d') % d
            elif options.locationstyle == options.GNU:
                # fit as many locations on one line, as long as the
                # resulting line length doesn't exceed 'options.width'
                locline = '#:'
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    s = _(' %(filename)s:%(lineno)d') % d
                    if len(locline) + len(s) <= options.width:
                        locline = locline + s
                    else:
                        print >>fp, locline
                        locline = "#:" + s
                # Flush the last, partially filled location line.
                if len(locline) > 2:
                    print >>fp, locline
            # TBD: sorting, normalizing
            print >>fp, 'msgid', normalize(k)
            print >>fp, 'msgstr ""\n'
325 \f
326 def main():
327 global default_keywords
328 try:
329 opts, args = getopt.getopt(
330 sys.argv[1:],
331 'ad:Ehk:Kno:p:S:Vvw:x:',
332 ['extract-all', 'default-domain', 'escape', 'help',
333 'keyword=', 'no-default-keywords',
334 'add-location', 'no-location', 'output=', 'output-dir=',
335 'style=', 'verbose', 'version', 'width=', 'exclude-file=',
336 ])
337 except getopt.error, msg:
338 usage(1, msg)
340 # for holding option values
341 class Options:
342 # constants
343 GNU = 1
344 SOLARIS = 2
345 # defaults
346 extractall = 0 # FIXME: currently this option has no effect at all.
347 escape = 0
348 keywords = []
349 outpath = ''
350 outfile = 'messages.pot'
351 writelocations = 1
352 locationstyle = GNU
353 verbose = 0
354 width = 78
355 excludefilename = ''
357 options = Options()
358 locations = {'gnu' : options.GNU,
359 'solaris' : options.SOLARIS,
360 }
362 # parse options
363 for opt, arg in opts:
364 if opt in ('-h', '--help'):
365 usage(0)
366 elif opt in ('-a', '--extract-all'):
367 options.extractall = 1
368 elif opt in ('-d', '--default-domain'):
369 options.outfile = arg + '.pot'
370 elif opt in ('-E', '--escape'):
371 options.escape = 1
372 elif opt in ('-k', '--keyword'):
373 options.keywords.append(arg)
374 elif opt in ('-K', '--no-default-keywords'):
375 default_keywords = []
376 elif opt in ('-n', '--add-location'):
377 options.writelocations = 1
378 elif opt in ('--no-location',):
379 options.writelocations = 0
380 elif opt in ('-S', '--style'):
381 options.locationstyle = locations.get(arg.lower())
382 if options.locationstyle is None:
383 usage(1, _('Invalid value for --style: %s') % arg)
384 elif opt in ('-o', '--output'):
385 options.outfile = arg
386 elif opt in ('-p', '--output-dir'):
387 options.outpath = arg
388 elif opt in ('-v', '--verbose'):
389 options.verbose = 1
390 elif opt in ('-V', '--version'):
391 print _('pygettext.py (xgettext for Python) %s') % __version__
392 sys.exit(0)
393 elif opt in ('-w', '--width'):
394 try:
395 options.width = int(arg)
396 except ValueError:
397 usage(1, _('--width argument must be an integer: %s') % arg)
398 elif opt in ('-x', '--exclude-file'):
399 options.excludefilename = arg
401 # calculate escapes
402 make_escapes(options.escape)
404 # calculate all keywords
405 options.keywords.extend(default_keywords)
407 # initialize list of strings to exclude
408 if options.excludefilename:
409 try:
410 fp = open(options.excludefilename)
411 options.toexclude = fp.readlines()
412 fp.close()
413 except IOError:
414 sys.stderr.write(_("Can't read --exclude-file: %s") %
415 options.excludefilename)
416 sys.exit(1)
417 else:
418 options.toexclude = []
420 # on win32, do internal globbing
421 if sys.platform == 'win32':
422 import glob
423 expanded = []
424 for arg in args:
425 expanded.extend(glob.glob(arg))
426 args = expanded
428 # slurp through all the files
429 eater = TokenEater(options)
430 for filename in args:
431 if filename == '-':
432 if options.verbose:
433 print _('Reading standard input')
434 fp = sys.stdin
435 closep = 0
436 else:
437 if options.verbose:
438 print _('Working on %s') % filename
439 fp = open(filename)
440 closep = 1
441 try:
442 eater.set_filename(filename)
443 tokenize.tokenize(fp.readline, eater)
444 finally:
445 if closep:
446 fp.close()
448 # write the output
449 if options.outfile == '-':
450 fp = sys.stdout
451 closep = 0
452 else:
453 if options.outpath:
454 options.outfile = os.path.join(options.outpath, options.outfile)
455 fp = open(options.outfile, 'w')
456 closep = 1
457 try:
458 eater.write(fp)
459 finally:
460 if closep:
461 fp.close()
463 \f
# Script entry point.  The calls after main() are not needed for normal
# operation (their results are discarded); they are extra input for the
# scanner when pygettext is run on its own source file.  Do not reformat
# them: their exact token sequence is what is being exercised.
if __name__ == '__main__':
    main()
    # some more test strings
    # a unicode literal
    _(u'a unicode string')
    # not a plain literal: the '%' operator after the string makes the
    # scanner report an unexpected token
    _('*** Seen unexpected token "%(token)s"' % {'token': 'test'})
    # adjacent literals, which the scanner joins into a single message
    _('more' 'than' 'one' 'string')