1 #! /usr/bin/env python
2 # Originally written by Barry Warsaw <barry@zope.com>
3 #
4 # Minimally patched to make it even more xgettext compatible
5 # by Peter Funk <pf@artcom-gmbh.de>
6 #
7 # 2001-12-18 Jürgen Hermann <jh@web.de>
8 # Added checks that _() only contains string literals, and
9 # command line args are resolved to module lists, i.e. you
10 # can now pass a filename, a module or package name, or a
11 # directory (including globbing chars, important for Win32).
12 # Made docstring fit in 80 chars wide displays using pydoc.
13 #
15 # for selftesting
16 try:
17 import fintl
18 _ = fintl.gettext
19 except ImportError:
20 _ = lambda s: s
# Module docstring doubles as the --help text: usage() prints it through
# "% globals()" so %(DEFAULTKEYWORDS)s below is interpolated at runtime.
__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

[1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
[2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")
157 import os
158 import sys
159 import time
160 import getopt
161 import token
162 import tokenize
163 import operator
__version__ = '1.5'

# Keywords always searched for unless -K/--no-default-keywords is given;
# -k/--keyword adds to this set in main().  DEFAULTKEYWORDS is the
# display form interpolated into the module docstring by usage().
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''
173 \f
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
# %(time)s and %(version)s are filled in by TokenEater.write(); the doubled
# backslashes (\\n) keep a literal backslash-n in the emitted .pot text.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
195 \f
def usage(code, msg=''):
    """Write the interpolated module help text (and an optional extra
    message) to stderr, then terminate with exit status 'code'.
    """
    sys.stderr.write(__doc__ % globals() + '\n')
    if msg:
        sys.stderr.write(str(msg) + '\n')
    sys.exit(code)
203 \f
204 escapes = []
206 def make_escapes(pass_iso8859):
207 global escapes
208 if pass_iso8859:
209 # Allow iso-8859 characters to pass through so that e.g. 'msgid
210 # "Höhe"' would result not result in 'msgid "H\366he"'. Otherwise we
211 # escape any character outside the 32..126 range.
212 mod = 128
213 else:
214 mod = 256
215 for i in range(256):
216 if 32 <= (i % mod) <= 126:
217 escapes.append(chr(i))
218 else:
219 escapes.append("\\%03o" % i)
220 escapes[ord('\\')] = '\\\\'
221 escapes[ord('\t')] = '\\t'
222 escapes[ord('\r')] = '\\r'
223 escapes[ord('\n')] = '\\n'
224 escapes[ord('\"')] = '\\"'
227 def escape(s):
228 global escapes
229 s = list(s)
230 for i in range(len(s)):
231 s[i] = escapes[ord(s[i])]
232 return EMPTYSTRING.join(s)
def safe_eval(s):
    """Evaluate a source-code string literal and return its value.

    Used to unwrap quoting from tokenize STRING tokens.  Builtins are
    masked out so the evaluated text cannot reach them.
    """
    # NOTE(review): still eval() on source text; inputs come from the
    # tokenizer, which limits exposure, but keep it that way.
    empty_builtins = {'__builtins__': {}}
    return eval(s, empty_builtins, {})
def normalize(s):
    """Render message string s in the C-like form used by .po files."""
    pieces = s.split('\n')
    if len(pieces) == 1:
        # Single-line message: one quoted, escaped string.
        return '"' + escape(s) + '"'
    # Multi-line message: when the string ended with a newline the split
    # leaves an empty trailing piece -- drop it and fold the newline back
    # onto the last real line.
    if not pieces[-1]:
        del pieces[-1]
        pieces[-1] = pieces[-1] + '\n'
    escaped = [escape(piece) for piece in pieces]
    # Emit the multi-line style: empty msgid line, then one quoted,
    # "\n"-terminated string per source line.
    lineterm = '\\n"\n"'
    return '""\n"' + lineterm.join(escaped) + '"'
256 \f
def containsAny(str, set):
    """Return true if 'str' contains ANY of the characters in 'set'."""
    # NOTE: the parameter names shadow the str/set builtins; they are kept
    # unchanged for backward compatibility with keyword callers.
    for c in set:
        if c in str:
            return True
    return False
def _visit_pyfiles(list, dirname, names):
    """os.path.walk() callback helper for getFilesForName().

    Appends every Python source file found in 'dirname' to 'list' and
    prunes CVS directories from the traversal.
    """
    # Determine the Python source extension once and cache it globally.
    if '_py_ext' not in globals():
        import imp
        global _py_ext
        _py_ext = [triple[0] for triple in imp.get_suffixes()
                   if triple[2] == imp.PY_SOURCE][0]

    # don't recurse into CVS directories
    if 'CVS' in names:
        names.remove('CVS')

    # add all *.py files to list
    list.extend(
        [os.path.join(dirname, fname)
         for fname in names
         if os.path.splitext(fname)[1] == _py_ext])
def _get_modpkg_path(dotted_name, pathlist=None):
    """Get the filesystem path for a module or a package.

    Return the file system path to a file for a module, and to a
    directory for a package.  Return None if the name is not found, or
    refers to a builtin or extension module.
    """
    import imp

    # Peel off the left-most component of the dotted name.
    parts = dotted_name.split('.', 1)

    if len(parts) == 1:
        # Plain, undotted name: resolve it directly.
        try:
            file, pathname, description = imp.find_module(
                dotted_name, pathlist)
            if file:
                file.close()
            if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
                return None
            return pathname
        except ImportError:
            return None

    # Dotted path: resolve the top-level package first.
    try:
        file, pathname, description = imp.find_module(parts[0], pathlist)
        if file:
            file.close()
    except ImportError:
        return None

    if description[2] != imp.PKG_DIRECTORY:
        # Not a package, so the remaining components cannot resolve.
        return None
    # Recursively resolve the remaining name parts inside the package.
    return _get_modpkg_path(parts[1], [pathname])
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory (globbing characters allowed).

    Returns a (possibly empty) list of paths to Python source files.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            import glob
            result = []
            for match in glob.glob(name):
                result.extend(getFilesForName(match))
            return result

        # not a path: try to resolve it as a module or package name
        name = _get_modpkg_path(name)
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        # NOTE(review): os.path.walk is Python 2 only (removed in Py3).
        result = []
        os.path.walk(name, _visit_pyfiles, result)
        return result
    elif os.path.exists(name):
        # a single file
        return [name]

    return []
354 \f
class TokenEater:
    """Tokenize-callback state machine that collects translatable strings.

    An instance is fed every token of each input file via __call__ and
    accumulates a mapping of
        message string -> {(filename, lineno): isdocstring flag}
    write() then renders the collected messages as a .pot file.
    """
    def __init__(self, options):
        self.__options = options
        # message -> {(filename, lineno): isdocstring}
        self.__messages = {}
        # current state handler; one of the bound __waiting/__suiteseen/
        # __suitedocstring/__keywordseen/__openseen methods
        self.__state = self.__waiting
        # string fragments collected inside the current keyword call
        self.__data = []
        # line number where the current keyword call started
        self.__lineno = -1
        # true until the first significant token of a module is seen
        self.__freshmodule = 1
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state, passing the token's start line
##        import token
##        print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
##              'tstring:', tstring
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Default state: watch for docstrings and extraction keywords."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                elif ttype not in (tokenize.COMMENT, tokenize.NL):
                    # any other significant token means the module has no
                    # docstring
                    self.__freshmodule = 0
                return
            # class or function docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        """After 'class'/'def': ignore anything until we see the colon."""
        if ttype == tokenize.OP and tstring == ':':
            self.__state = self.__suitedocstring

    def __suitedocstring(self, ttype, tstring, lineno):
        """After the suite colon: grab the docstring if one follows."""
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        """After a keyword like _: only an opening paren starts a call."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside a keyword call: collect string tokens until ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print >> sys.stderr, _(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {'token': tstring, 'file': self.__curfile,
                     'lineno': self.__lineno}
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record one occurrence of msg unless it is in the exclude list."""
        if lineno is None:
            lineno = self.__lineno
        if not msg in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file; re-arm module-docstring detection."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Emit all collected messages to fp in .pot format."""
        options = self.__options
        timestamp = time.ctime(time.time())
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        print >> fp, pot_header % {'time': timestamp, 'version': __version__}
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = v.keys()
            keys.sort()
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = reverse.keys()
        rkeys.sort()
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                isdocstring = 0
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may
                # wish to skip translating some unimportant docstrings.
                # (reduce over the flag values: nonzero iff any occurrence
                # was a docstring.)
                if reduce(operator.__add__, v.values()):
                    isdocstring = 1
                # k is the message string, v is a dictionary-set of
                # (filename, lineno) tuples.  We want to sort the entries
                # in v first by file name and then by line number.
                v = v.keys()
                v.sort()
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print >> fp, _(
                            '# File: %(filename)s, line: %(lineno)d') % d
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceeds 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print >> fp, locline
                            locline = "#:" + s
                    # flush any partially filled location line
                    if len(locline) > 2:
                        print >> fp, locline
                if isdocstring:
                    print >> fp, '#, docstring'
                print >> fp, 'msgid', normalize(k)
                print >> fp, 'msgstr ""\n'
501 \f
def main():
    """Command-line entry point: parse options, scan inputs, write .pot."""
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings', 'no-docstrings',
             ])
    except getopt.error, msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    # maps the (case-insensitive) --style argument to its constant
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print _('pygettext.py (xgettext for Python) %s') % __version__
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            # each line of the file names one input file whose docstrings
            # should not be extracted (strip the trailing newline)
            fp = open(arg)
            try:
                while 1:
                    line = fp.readline()
                    if not line:
                        break
                    options.nodocstrings[line[:-1]] = 1
            finally:
                fp.close()

    # calculate escapes
    make_escapes(options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            options.toexclude = fp.readlines()
            fp.close()
        except IOError:
            print >> sys.stderr, _(
                "Can't read --exclude-file: %s") % options.excludefilename
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print _('Reading standard input')
            fp = sys.stdin
            closep = 0
        else:
            if options.verbose:
                print _('Working on %s') % filename
            fp = open(filename)
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokenize.tokenize(fp.readline, eater)
            except tokenize.TokenError, e:
                # report tokenizer failures but keep processing other files
                print >> sys.stderr, '%s: %s, line %d, column %d' % (
                    e[0], filename, e[1][0], e[1][1])
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
659 \f
if __name__ == '__main__':
    main()
    # some more test strings, exercised when pygettext is run on itself
    _(u'a unicode string')
    _('*** Seen unexpected token "%(token)s"' % {'token': 'test'}) # this one creates a warning
    _('more' 'than' 'one' 'string')