b9f8419c7c96b6e0f545cfd96e3c46597d2874d7
1 #! /usr/bin/env python
2 # Originally written by Barry Warsaw <bwarsaw@python.org>
3 #
4 # minimally patched to make it even more xgettext compatible
5 # by Peter Funk <pf@artcom-gmbh.de>
6 #
7 # 2001-11-21 Jürgen Hermann <jh@web.de>
8 # Checks that _() only contains string literals added, and
9 # command line args are resolved to module lists, i.e. you
10 # can now pass a filename, a module or package name, or a
11 # directory (including globbing chars, important for Win32).
12 #
14 # for selftesting
15 try:
16 import fintl
17 _ = fintl.gettext
18 except ImportError:
19 _ = lambda s: s
22 __doc__ = _("""pygettext -- Python equivalent of xgettext(1)
24 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
25 internationalization of C programs. Most of these tools are independent of
26 the programming language and can be used from within Python programs. Martin
27 von Loewis' work[1] helps considerably in this regard.
29 There's one problem though; xgettext is the program that scans source code
30 looking for message strings, but it groks only C (or C++). Python introduces
31 a few wrinkles, such as dual quoting characters, triple quoted strings, and
32 raw strings. xgettext understands none of this.
34 Enter pygettext, which uses Python's standard tokenize module to scan Python
35 source code, generating .pot files identical to what GNU xgettext[2] generates
36 for C and C++ code. From there, the standard GNU tools can be used.
38 A word about marking Python strings as candidates for translation. GNU
39 xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
40 gettext_noop. But those can be a lot of text to include all over your code.
41 C and C++ have a trick: they use the C preprocessor. Most internationalized C
42 source includes a #define for gettext() to _() so that what has to be written
43 in the source is much less. Thus these are both translatable strings:
45 gettext("Translatable String")
46 _("Translatable String")
48 Python of course has no preprocessor so this doesn't work so well. Thus,
49 pygettext searches only for _() by default, but see the -k/--keyword flag
50 below for how to augment this.
52 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
53 [2] http://www.gnu.org/software/gettext/gettext.html
55 NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
56 where ever possible. However some options are still missing or are not fully
57 implemented. Also, xgettext's use of command line switches with option
58 arguments is broken, and in these cases, pygettext just defines additional
59 switches.
61 Usage: pygettext [options] inputfile ...
63 Options:
65 -a
66 --extract-all
67 Extract all strings
69 -d name
70 --default-domain=name
71 Rename the default output file from messages.pot to name.pot
73 -E
74 --escape
75 replace non-ASCII characters with octal escape sequences.
77 -h
78 --help
79 print this help message and exit
81 -k word
82 --keyword=word
83 Keywords to look for in addition to the default set, which are:
84 %(DEFAULTKEYWORDS)s
86 You can have multiple -k flags on the command line.
88 -K
89 --no-default-keywords
90 Disable the default set of keywords (see above). Any keywords
91 explicitly added with the -k/--keyword option are still recognized.
93 --no-location
94 Do not write filename/lineno location comments.
96 -n
97 --add-location
98 Write filename/lineno location comments indicating where each
99 extracted string is found in the source. These lines appear before
100 each msgid. The style of comments is controlled by the -S/--style
101 option. This is the default.
103 -S stylename
104 --style stylename
105 Specify which style to use for location comments. Two styles are
106 supported:
108 Solaris # File: filename, line: line-number
109 GNU #: filename:line
111 The style name is case insensitive. GNU style is the default.
113 -o filename
114 --output=filename
115 Rename the default output file from messages.pot to filename. If
116 filename is `-' then the output is sent to standard out.
118 -p dir
119 --output-dir=dir
120 Output files will be placed in directory dir.
122 -v
123 --verbose
124 Print the names of the files being processed.
126 -V
127 --version
128 Print the version of pygettext and exit.
130 -w columns
131 --width=columns
132 Set width of output to columns.
134 -x filename
135 --exclude-file=filename
136 Specify a file that contains a list of strings that are not to be
137 extracted from the input files. Each string to be excluded must
138 appear on a line by itself in the file.
140 If `inputfile' is -, standard input is read.
142 """)
144 import os
145 import sys
146 import time
147 import getopt
148 import token
149 import tokenize
# Version string reported by -V/--version and written into the pot header.
__version__ = '1.1'

# Keywords recognized as translation markers unless -K/--no-default-keywords
# is given; main() appends these to any keywords passed with -k.
default_keywords = ['_']
# Comma-separated form of the defaults, interpolated into the usage text
# through the %(DEFAULTKEYWORDS)s placeholder.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used when joining string fragments back together.
EMPTYSTRING = ''
159 \f
# The normal pot-file header.  msgmerge and EMACS' po-mode work better if
# it's there.  The %(time)s and %(version)s placeholders are filled in by
# TokenEater.write() when the catalog is emitted.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"PO-Revision-Date: %(time)s\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
180 \f
def usage(code, msg=''):
    """Print the help text and an optional message, then exit.

    The help text is the module docstring with the module globals
    interpolated into it.  `msg', when non-empty, is printed after the
    help text.  `code' is handed to sys.exit().
    """
    help_text = __doc__ % globals()
    print(help_text)
    if msg:
        print(msg)
    sys.exit(code)
188 \f
# Escape table: escapes[n] is the text written to the .po file for the
# character with ordinal n.  Populated by make_escapes().
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the 256-entry escape table used by escape().

    pass_iso8859 -- when true, a character is passed through unchanged if
        its ordinal modulo 128 lies in 32..126, so that e.g. 'msgid "Höhe"'
        does not become 'msgid "H\\366he"'.  When false, every character
        outside the 32..126 range is written as a three-digit octal escape.

    Backslash, tab, carriage return, newline and the double quote always
    get their C-style escapes.  The table is rebuilt from scratch on every
    call and stored in place, so calling this more than once leaves exactly
    256 entries that reflect the most recent call.
    """
    if pass_iso8859:
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Replace the contents rather than rebinding the name, so every existing
    # reference to the list sees the new table.
    escapes[:] = table
def escape(s):
    """Return `s' with each character replaced by its escape-table entry.

    The table is the module-level `escapes' list, indexed by ordinal, so
    make_escapes() must have been called first.
    """
    return EMPTYSTRING.join([escapes[ord(ch)] for ch in s])
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    `s' is evaluated with an empty builtins mapping and an empty local
    namespace, which unwraps the quotes and resolves escape sequences.
    """
    # NOTE(review): an empty __builtins__ restricts eval() but is not a
    # sandbox; this presumably only ever receives STRING tokens -- confirm
    # before using it on anything else.
    no_builtins = {'__builtins__': {}}
    no_locals = {}
    return eval(s, no_builtins, no_locals)
def normalize(s):
    """Convert a Python string value into .po-file (C-like) quoted form.

    A string without newlines becomes a single quoted, escaped string.
    A multi-line string becomes an empty "" line followed by one quoted
    line per input line, each but the last ending in an escaped newline.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # The text ended with a newline: drop the empty tail and put the
        # newline back on the real last line so it gets escaped with it.
        pieces.pop()
        pieces[-1] += '\n'
    quoted = [escape(piece) for piece in pieces]
    return '""\n"' + '\\n"\n"'.join(quoted) + '"'
241 \f
def containsAny(str, set):
    """ Tell whether 'str' contains at least one of the chars in 'set'.
    """
    for candidate in set:
        if candidate in str:
            return True
    return False
249 def _visit_pyfiles(list, dirname, names):
250 """ Helper for getFilesForName().
251 """
252 # get extension for python source files
253 if not globals().has_key('_py_ext'):
254 import imp
255 global _py_ext
256 _py_ext = [triple[0] for triple in imp.get_suffixes() if triple[2] == imp.PY_SOURCE][0]
258 # don't recurse into CVS directories
259 if 'CVS' in names:
260 names.remove('CVS')
262 # add all *.py files to list
263 list.extend(
264 [os.path.join(dirname, file)
265 for file in names
266 if os.path.splitext(file)[1] == _py_ext])
269 def _get_modpkg_path(dotted_name, pathlist=None):
270 """ Get the filesystem path for a module or a package.
272 Return the file system path to a file for a module,
273 and to a directory for a package. Return None if
274 the name is not found, or is a builtin or extension module.
275 """
276 import imp
278 # split off top-most name
279 parts = dotted_name.split('.', 1)
281 if len(parts) > 1:
282 # we have a dotted path, import top-level package
283 try:
284 file, pathname, description = imp.find_module(parts[0], pathlist)
285 if file: file.close()
286 except ImportError:
287 return None
289 # check if it's indeed a package
290 if description[2] == imp.PKG_DIRECTORY:
291 # recursively handle the remaining name parts
292 pathname = _get_modpkg_path(parts[1], [pathname])
293 else:
294 pathname = None
295 else:
296 # plain name
297 try:
298 file, pathname, description = imp.find_module(dotted_name, pathlist)
299 if file: file.close()
300 if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
301 pathname = None
302 except ImportError:
303 pathname = None
305 return pathname
def getFilesForName(name):
    """ Get a list of module files for a filename, a module or package name,
        or a directory.

        'name' is resolved in this order: an existing path is used as is;
        a non-existing name containing glob characters is expanded and each
        match is resolved recursively; anything else is looked up as a
        module or package name.  A directory yields every python source
        file found below it, a file yields a one-element list, and an
        unresolvable name yields an empty list.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            import glob
            matches = []
            for match in glob.glob(name):
                matches.extend(getFilesForName(match))
            return matches

        # try to find module or package
        name = _get_modpkg_path(name)
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        found = []
        os.path.walk(name, _visit_pyfiles, found)
        return found
    elif os.path.exists(name):
        # a single file
        return [name]

    return []
340 \f
class TokenEater:
    """Token consumer that collects the string arguments of keyword calls.

    An instance is called once per token and runs a three-state machine:
    waiting for a keyword name, expecting the opening parenthesis, and
    gathering string literals up to the closing parenthesis.  Every message
    found is stored with the list of (filename, lineno) places it occurred
    at; write() emits the collected catalog.

    The options object must provide `keywords' and `toexclude' for
    scanning, and `writelocations', `locationstyle', `width' plus the
    `GNU' and `SOLARIS' constants for writing.
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        # Set for real by set_filename(); initialized here so the attribute
        # exists even if tokens arrive before a filename was announced.
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the handler of the current state; only the token
        # type, its text and its starting row are of interest
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                msg = EMPTYSTRING.join(self.__data)
                if msg not in self.__options.toexclude:
                    entry = (self.__curfile, self.__lineno)
                    self.__messages.setdefault(msg, []).append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace, and
            # give up on this call
            warning = _('*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"') % {
                'token': tstring, 'file': self.__curfile, 'lineno': self.__lineno}
            sys.stderr.write(warning + '\n')
            self.__state = self.__waiting

    def set_filename(self, filename):
        """Set the filename recorded with messages found from now on."""
        self.__curfile = filename

    def write(self, fp):
        """Write the pot header and all collected messages to `fp'.

        Output goes straight to `fp'; sys.stdout is left untouched, so a
        redirection installed by the caller stays in effect.
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        # common header
        # The time stamp in the header doesn't have the same format
        # as that generated by xgettext...
        fp.write(pot_header % {'time': timestamp, 'version': __version__})
        fp.write('\n')
        for k, v in self.__messages.items():
            if not options.writelocations:
                pass
            # location comments are different b/w Solaris and GNU:
            elif options.locationstyle == options.SOLARIS:
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    fp.write(_('# File: %(filename)s, line: %(lineno)d') % d
                             + '\n')
            elif options.locationstyle == options.GNU:
                # fit as many locations on one line, as long as the
                # resulting line length doesn't exceed 'options.width'
                locline = '#:'
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    s = _(' %(filename)s:%(lineno)d') % d
                    if len(locline) + len(s) <= options.width:
                        locline = locline + s
                    else:
                        fp.write(locline + '\n')
                        locline = "#:" + s
                if len(locline) > 2:
                    fp.write(locline + '\n')
            # TBD: sorting, normalizing
            fp.write('msgid ' + normalize(k) + '\n')
            fp.write('msgstr ""\n\n')
430 \f
def main():
    """Command-line entry point: parse options, scan inputs, write the catalog.

    Options are read from sys.argv[1:].  Every remaining argument is
    resolved to a list of source files (a `-' argument stands for standard
    input and is kept as is), each file is tokenized into a TokenEater, and
    the collected messages are written to the selected output file, or to
    standard output when the output file name is `-'.

    Exits through usage() on invalid options, and with status 1 when the
    exclude file cannot be read.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:Ehk:Kno:p:S:Vvw:x:',
            # 'default-domain=' takes an argument, like its short form -d
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             ])
    except getopt.error:
        # fetch the exception through sys.exc_info() so this parses under
        # both the "except X, e" and the "except X as e" syntax generations
        usage(1, sys.exc_info()[1])

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0  # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''

    options = Options()
    locations = {'gnu': options.GNU,
                 'solaris': options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg

    # calculate escapes: -E asks for non-ASCII characters to be escaped,
    # while make_escapes() takes the opposite flag (pass them through)
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            try:
                # one string per line; drop the line terminator, otherwise
                # no entry could ever equal an extracted message
                options.toexclude = [line.rstrip('\n')
                                     for line in fp.readlines()]
            finally:
                fp.close()
        except IOError:
            sys.stderr.write(_("Can't read --exclude-file: %s") %
                             options.excludefilename)
            sys.stderr.write('\n')
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists; `-' means standard input and must not
    # be treated as a file, module or package name
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename)
            closep = 1
        try:
            eater.set_filename(filename)
            tokenize.tokenize(fp.readline, eater)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
566 \f
# Run the extractor when executed as a script.
if __name__ == '__main__':
    main()
    # some more test strings
    # These calls exist as input for scanning this file with itself; their
    # exact token sequence is what matters, so keep them as written.
    # A unicode literal as the message.
    _(u'a unicode string')
    # The `%' operator inside _() is not a string token, so the scanner
    # reports an unexpected token for this call.
    _('*** Seen unexpected token "%(token)s"' % {'token': 'test'})
    # Adjacent string literals are collected and joined into one message.
    _('more' 'than' 'one' 'string')
572 _('more' 'than' 'one' 'string')