tools/pygettext.py

   1 #! /usr/bin/env python
   2 # Originally written by Barry Warsaw <barry@zope.com>
   3 #
   4 # Minimally patched to make it even more xgettext compatible
   5 # by Peter Funk <pf@artcom-gmbh.de>
   6 #
   7 # 2001-12-18 Jürgen Hermann <jh@web.de>
   8 # Added checks that _() only contains string literals, and
   9 # command line args are resolved to module lists, i.e. you
  10 # can now pass a filename, a module or package name, or a
  11 # directory (including globbing chars, important for Win32).
  12 # Made docstring fit in 80 chars wide displays using pydoc.
  13 #
  14
# for selftesting
# fintl is optional: when it can be imported its gettext() is used as the
# translation marker/function, otherwise _ is an identity function so the
# strings marked with _() below are used unchanged.
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s
  21
  22 __doc__ = _("""pygettext -- Python equivalent of xgettext(1)
  23
  24 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
  25 internationalization of C programs. Most of these tools are independent of
  26 the programming language and can be used from within Python programs.
  27 Martin von Loewis' work[1] helps considerably in this regard.
  28
  29 There's one problem though; xgettext is the program that scans source code
  30 looking for message strings, but it groks only C (or C++). Python
  31 introduces a few wrinkles, such as dual quoting characters, triple quoted
  32 strings, and raw strings. xgettext understands none of this.
  33
  34 Enter pygettext, which uses Python's standard tokenize module to scan
  35 Python source code, generating .pot files identical to what GNU xgettext[2]
  36 generates for C and C++ code. From there, the standard GNU tools can be
  37 used.
  38
  39 A word about marking Python strings as candidates for translation. GNU
  40 xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
  41 and gettext_noop. But those can be a lot of text to include all over your
  42 code. C and C++ have a trick: they use the C preprocessor. Most
  43 internationalized C source includes a #define for gettext() to _() so that
  44 what has to be written in the source is much less. Thus these are both
  45 translatable strings:
  46
  47     gettext("Translatable String")
  48     _("Translatable String")
  49
  50 Python of course has no preprocessor so this doesn't work so well.  Thus,
  51 pygettext searches only for _() by default, but see the -k/--keyword flag
  52 below for how to augment this.
  53
  54  [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
  55  [2] http://www.gnu.org/software/gettext/gettext.html
  56
  57 NOTE: pygettext attempts to be option and feature compatible with GNU
  58 xgettext where ever possible. However some options are still missing or are
  59 not fully implemented. Also, xgettext's use of command line switches with
  60 option arguments is broken, and in these cases, pygettext just defines
  61 additional switches.
  62
  63 Usage: pygettext [options] inputfile ...
  64
  65 Options:
  66
  67     -a
  68     --extract-all
  69         Extract all strings.
  70
  71     -d name
  72     --default-domain=name
  73         Rename the default output file from messages.pot to name.pot.
  74
  75     -E
  76     --escape
  77         Replace non-ASCII characters with octal escape sequences.
  78
  79     -D
  80     --docstrings
  81         Extract module, class, method, and function docstrings.  These do
  82         not need to be wrapped in _() markers, and in fact cannot be for
  83         Python to consider them docstrings. (See also the -X option).
  84
  85     -h
  86     --help
  87         Print this help message and exit.
  88
  89     -k word
  90     --keyword=word
  91         Keywords to look for in addition to the default set, which are:
  92         %(DEFAULTKEYWORDS)s
  93
  94         You can have multiple -k flags on the command line.
  95
  96     -K
  97     --no-default-keywords
  98         Disable the default set of keywords (see above).  Any keywords
  99         explicitly added with the -k/--keyword option are still recognized.
 100
 101     --no-location
 102         Do not write filename/lineno location comments.
 103
 104     -n
 105     --add-location
 106         Write filename/lineno location comments indicating where each
 107         extracted string is found in the source.  These lines appear before
 108         each msgid.  The style of comments is controlled by the -S/--style
 109         option.  This is the default.
 110
 111     -o filename
 112     --output=filename
 113         Rename the default output file from messages.pot to filename.  If
 114         filename is `-' then the output is sent to standard out.
 115
 116     -p dir
 117     --output-dir=dir
 118         Output files will be placed in directory dir.
 119
 120     -S stylename
 121     --style stylename
 122         Specify which style to use for location comments.  Two styles are
 123         supported:
 124
 125         Solaris  # File: filename, line: line-number
 126         GNU      #: filename:line
 127
 128         The style name is case insensitive.  GNU style is the default.
 129
 130     -v
 131     --verbose
 132         Print the names of the files being processed.
 133
 134     -V
 135     --version
 136         Print the version of pygettext and exit.
 137
 138     -w columns
 139     --width=columns
 140         Set width of output to columns.
 141
 142     -x filename
 143     --exclude-file=filename
 144         Specify a file that contains a list of strings that are not be
 145         extracted from the input files.  Each string to be excluded must
 146         appear on a line by itself in the file.
 147
 148     -X filename
 149     --no-docstrings=filename
 150         Specify a file that contains a list of files (one per line) that
 151         should not have their docstrings extracted.  This is only useful in
 152         conjunction with the -D option above.
 153
 154 If `inputfile' is -, standard input is read.
 155 """)
 156
 157 import os
 158 import sys
 159 import time
 160 import getopt
 161 import token
 162 import tokenize
 163 import operator
 164
# Version string reported by -V/--version and written into the
# "Generated-By" line of the .pot header.
__version__ = '1.5'

# Keywords whose call arguments are extracted unless -K is given.
default_keywords = ['_']
# Comma-separated form of the defaults, substituted into the help text
# through the %(DEFAULTKEYWORDS)s placeholder.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used when joining string fragments without any delimiter.
EMPTYSTRING = ''
 171
 172
 173 \f
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
# The template has two placeholders, %(time)s and %(version)s, which
# TokenEater.write() fills in before printing the header.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
 194
 195 \f
def usage(code, msg=''):
    """Print the help text to stderr and exit.

    code is the process exit status handed to sys.exit().  msg, when
    non-empty, is printed on stderr after the help text.
    """
    # The help text contains %(DEFAULTKEYWORDS)s, which is filled in from
    # the module globals.
    print >> sys.stderr, __doc__ % globals()
    if msg:
        print >> sys.stderr, msg
    sys.exit(code)
 201
 202
 203 \f
 204 escapes = []
 205
 206 def make_escapes(pass_iso8859):
 207     global escapes
 208     if pass_iso8859:
 209         # Allow iso-8859 characters to pass through so that e.g. 'msgid
 210         # "Höhe"' would result not result in 'msgid "H\366he"'.  Otherwise we
 211         # escape any character outside the 32..126 range.
 212         mod = 128
 213     else:
 214         mod = 256
 215     for i in range(256):
 216         if 32 <= (i % mod) <= 126:
 217             escapes.append(chr(i))
 218         else:
 219             escapes.append("\\%03o" % i)
 220     escapes[ord('\\')] = '\\\\'
 221     escapes[ord('\t')] = '\\t'
 222     escapes[ord('\r')] = '\\r'
 223     escapes[ord('\n')] = '\\n'
 224     escapes[ord('\"')] = '\\"'
 225
 226
 227 def escape(s):
 228     global escapes
 229     s = list(s)
 230     for i in range(len(s)):
 231         s[i] = escapes[ord(s[i])]
 232     return EMPTYSTRING.join(s)
 233
 234
 235 def safe_eval(s):
 236     # unwrap quotes, safely
 237     return eval(s, {'__builtins__':{}}, {})
 238
 239
 240 def normalize(s):
 241     # This converts the various Python string types into a format that is
 242     # appropriate for .po files, namely much closer to C style.
 243     lines = s.split('\n')
 244     if len(lines) == 1:
 245         s = '"' + escape(s) + '"'
 246     else:
 247         if not lines[-1]:
 248             del lines[-1]
 249             lines[-1] = lines[-1] + '\n'
 250         for i in range(len(lines)):
 251             lines[i] = escape(lines[i])
 252         lineterm = '\\n"\n"'
 253         s = '""\n"' + lineterm.join(lines) + '"'
 254     return s
 255
 256 \f
 257 def containsAny(str, set):
 258     """ Check whether 'str' contains ANY of the chars in 'set'
 259     """
 260     return 1 in [c in str for c in set]
 261
 262
 263 def _visit_pyfiles(list, dirname, names):
 264     """ Helper for getFilesForName().
 265     """
 266     # get extension for python source files
 267     if not globals().has_key('_py_ext'):
 268         import imp
 269         global _py_ext
 270         _py_ext = [triple[0] for triple in imp.get_suffixes() if triple[2] == imp.PY_SOURCE][0]
 271
 272     # don't recurse into CVS directories
 273     if 'CVS' in names:
 274         names.remove('CVS')
 275
 276     # add all *.py files to list
 277     list.extend(
 278         [os.path.join(dirname, file)
 279             for file in names
 280                 if os.path.splitext(file)[1] == _py_ext])
 281
 282
 283 def _get_modpkg_path(dotted_name, pathlist=None):
 284     """ Get the filesystem path for a module or a package.
 285
 286         Return the file system path to a file for a module,
 287         and to a directory for a package. Return None if
 288         the name is not found, or is a builtin or extension module.
 289     """
 290     import imp
 291
 292     # split off top-most name
 293     parts = dotted_name.split('.', 1)
 294
 295     if len(parts) > 1:
 296         # we have a dotted path, import top-level package
 297         try:
 298             file, pathname, description = imp.find_module(parts[0], pathlist)
 299             if file: file.close()
 300         except ImportError:
 301             return None
 302
 303         # check if it's indeed a package
 304         if description[2] == imp.PKG_DIRECTORY:
 305             # recursively handle the remaining name parts
 306             pathname = _get_modpkg_path(parts[1], [pathname])
 307         else:
 308             pathname = None
 309     else:
 310         # plain name
 311         try:
 312             file, pathname, description = imp.find_module(dotted_name, pathlist)
 313             if file: file.close()
 314             if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
 315                 pathname = None
 316         except ImportError:
 317             pathname = None
 318
 319     return pathname
 320
 321
 322 def getFilesForName(name):
 323     """ Get a list of module files for a filename, a module or package name,
 324         or a directory.
 325     """
 326     import imp
 327
 328     if not os.path.exists(name):
 329         # check for glob chars
 330         if containsAny(name, "*?[]"):
 331             import glob
 332             files = glob.glob(name)
 333             list = []
 334             for file in files:
 335                 list.extend(getFilesForName(file))
 336             return list
 337
 338         # try to find module or package
 339         name = _get_modpkg_path(name)
 340         if not name:
 341             return []
 342
 343     if os.path.isdir(name):
 344         # find all python files in directory
 345         list = []
 346         os.path.walk(name, _visit_pyfiles, list)
 347         return list
 348     elif os.path.exists(name):
 349         # a single file
 350         return [name]
 351
 352     return []
 353
 354 \f
 355 class TokenEater:
 356     def __init__(self, options):
 357         self.__options = options
 358         self.__messages = {}
 359         self.__state = self.__waiting
 360         self.__data = []
 361         self.__lineno = -1
 362         self.__freshmodule = 1
 363         self.__curfile = None
 364
 365     def __call__(self, ttype, tstring, stup, etup, line):
 366         # dispatch
 367 ##        import token
 368 ##        print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
 369 ##              'tstring:', tstring
 370         self.__state(ttype, tstring, stup[0])
 371
 372     def __waiting(self, ttype, tstring, lineno):
 373         opts = self.__options
 374         # Do docstring extractions, if enabled
 375         if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
 376             # module docstring?
 377             if self.__freshmodule:
 378                 if ttype == tokenize.STRING:
 379                     self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
 380                     self.__freshmodule = 0
 381                 elif ttype not in (tokenize.COMMENT, tokenize.NL):
 382                     self.__freshmodule = 0
 383                 return
 384             # class docstring?
 385             if ttype == tokenize.NAME and tstring in ('class', 'def'):
 386                 self.__state = self.__suiteseen
 387                 return
 388         if ttype == tokenize.NAME and tstring in opts.keywords:
 389             self.__state = self.__keywordseen
 390
 391     def __suiteseen(self, ttype, tstring, lineno):
 392         # ignore anything until we see the colon
 393         if ttype == tokenize.OP and tstring == ':':
 394             self.__state = self.__suitedocstring
 395
 396     def __suitedocstring(self, ttype, tstring, lineno):
 397         # ignore any intervening noise
 398         if ttype == tokenize.STRING:
 399             self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
 400             self.__state = self.__waiting
 401         elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
 402                            tokenize.COMMENT):
 403             # there was no class docstring
 404             self.__state = self.__waiting
 405
 406     def __keywordseen(self, ttype, tstring, lineno):
 407         if ttype == tokenize.OP and tstring == '(':
 408             self.__data = []
 409             self.__lineno = lineno
 410             self.__state = self.__openseen
 411         else:
 412             self.__state = self.__waiting
 413
 414     def __openseen(self, ttype, tstring, lineno):
 415         if ttype == tokenize.OP and tstring == ')':
 416             # We've seen the last of the translatable strings.  Record the
 417             # line number of the first line of the strings and update the list
 418             # of messages seen.  Reset state for the next batch.  If there
 419             # were no strings inside _(), then just ignore this entry.
 420             if self.__data:
 421                 self.__addentry(EMPTYSTRING.join(self.__data))
 422             self.__state = self.__waiting
 423         elif ttype == tokenize.STRING:
 424             self.__data.append(safe_eval(tstring))
 425         elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
 426                            token.NEWLINE, tokenize.NL]:
 427             # warn if we see anything else than STRING or whitespace
 428             print >>sys.stderr, _('*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"') % {
 429                 'token': tstring, 'file': self.__curfile, 'lineno': self.__lineno}
 430             self.__state = self.__waiting
 431
 432     def __addentry(self, msg, lineno=None, isdocstring=0):
 433         if lineno is None:
 434             lineno = self.__lineno
 435         if not msg in self.__options.toexclude:
 436             entry = (self.__curfile, lineno)
 437             self.__messages.setdefault(msg, {})[entry] = isdocstring
 438
 439     def set_filename(self, filename):
 440         self.__curfile = filename
 441         self.__freshmodule = 1
 442
 443     def write(self, fp):
 444         options = self.__options
 445         timestamp = time.ctime(time.time())
 446         # The time stamp in the header doesn't have the same format as that
 447         # generated by xgettext...
 448         print >> fp, pot_header % {'time': timestamp, 'version': __version__}
 449         # Sort the entries.  First sort each particular entry's keys, then
 450         # sort all the entries by their first item.
 451         reverse = {}
 452         for k, v in self.__messages.items():
 453             keys = v.keys()
 454             keys.sort()
 455             reverse.setdefault(tuple(keys), []).append((k, v))
 456         rkeys = reverse.keys()
 457         rkeys.sort()
 458         for rkey in rkeys:
 459             rentries = reverse[rkey]
 460             rentries.sort()
 461             for k, v in rentries:
 462                 isdocstring = 0
 463                 # If the entry was gleaned out of a docstring, then add a
 464                 # comment stating so.  This is to aid translators who may wish
 465                 # to skip translating some unimportant docstrings.
 466                 if reduce(operator.__add__, v.values()):
 467                     isdocstring = 1
 468                 # k is the message string, v is a dictionary-set of (filename,
 469                 # lineno) tuples.  We want to sort the entries in v first by
 470                 # file name and then by line number.
 471                 v = v.keys()
 472                 v.sort()
 473                 if not options.writelocations:
 474                     pass
 475                 # location comments are different b/w Solaris and GNU:
 476                 elif options.locationstyle == options.SOLARIS:
 477                     for filename, lineno in v:
 478                         d = {'filename': filename, 'lineno': lineno}
 479                         print >>fp, _(
 480                             '# File: %(filename)s, line: %(lineno)d') % d
 481                 elif options.locationstyle == options.GNU:
 482                     # fit as many locations on one line, as long as the
 483                     # resulting line length doesn't exceeds 'options.width'
 484                     locline = '#:'
 485                     for filename, lineno in v:
 486                         d = {'filename': filename, 'lineno': lineno}
 487                         s = _(' %(filename)s:%(lineno)d') % d
 488                         if len(locline) + len(s) <= options.width:
 489                             locline = locline + s
 490                         else:
 491                             print >> fp, locline
 492                             locline = "#:" + s
 493                     if len(locline) > 2:
 494                         print >> fp, locline
 495                 if isdocstring:
 496                     print >> fp, '#, docstring'
 497                 print >> fp, 'msgid', normalize(k)
 498                 print >> fp, 'msgstr ""\n'
 499
 500
 501 \f
 502 def main():
 503     global default_keywords
 504     try:
 505         opts, args = getopt.getopt(
 506             sys.argv[1:],
 507             'ad:DEhk:Kno:p:S:Vvw:x:X:',
 508             ['extract-all', 'default-domain=', 'escape', 'help',
 509              'keyword=', 'no-default-keywords',
 510              'add-location', 'no-location', 'output=', 'output-dir=',
 511              'style=', 'verbose', 'version', 'width=', 'exclude-file=',
 512              'docstrings', 'no-docstrings',
 513              ])
 514     except getopt.error, msg:
 515         usage(1, msg)
 516
 517     # for holding option values
 518     class Options:
 519         # constants
 520         GNU = 1
 521         SOLARIS = 2
 522         # defaults
 523         extractall = 0 # FIXME: currently this option has no effect at all.
 524         escape = 0
 525         keywords = []
 526         outpath = ''
 527         outfile = 'messages.pot'
 528         writelocations = 1
 529         locationstyle = GNU
 530         verbose = 0
 531         width = 78
 532         excludefilename = ''
 533         docstrings = 0
 534         nodocstrings = {}
 535
 536     options = Options()
 537     locations = {'gnu' : options.GNU,
 538                  'solaris' : options.SOLARIS,
 539                  }
 540
 541     # parse options
 542     for opt, arg in opts:
 543         if opt in ('-h', '--help'):
 544             usage(0)
 545         elif opt in ('-a', '--extract-all'):
 546             options.extractall = 1
 547         elif opt in ('-d', '--default-domain'):
 548             options.outfile = arg + '.pot'
 549         elif opt in ('-E', '--escape'):
 550             options.escape = 1
 551         elif opt in ('-D', '--docstrings'):
 552             options.docstrings = 1
 553         elif opt in ('-k', '--keyword'):
 554             options.keywords.append(arg)
 555         elif opt in ('-K', '--no-default-keywords'):
 556             default_keywords = []
 557         elif opt in ('-n', '--add-location'):
 558             options.writelocations = 1
 559         elif opt in ('--no-location',):
 560             options.writelocations = 0
 561         elif opt in ('-S', '--style'):
 562             options.locationstyle = locations.get(arg.lower())
 563             if options.locationstyle is None:
 564                 usage(1, _('Invalid value for --style: %s') % arg)
 565         elif opt in ('-o', '--output'):
 566             options.outfile = arg
 567         elif opt in ('-p', '--output-dir'):
 568             options.outpath = arg
 569         elif opt in ('-v', '--verbose'):
 570             options.verbose = 1
 571         elif opt in ('-V', '--version'):
 572             print _('pygettext.py (xgettext for Python) %s') % __version__
 573             sys.exit(0)
 574         elif opt in ('-w', '--width'):
 575             try:
 576                 options.width = int(arg)
 577             except ValueError:
 578                 usage(1, _('--width argument must be an integer: %s') % arg)
 579         elif opt in ('-x', '--exclude-file'):
 580             options.excludefilename = arg
 581         elif opt in ('-X', '--no-docstrings'):
 582             fp = open(arg)
 583             try:
 584                 while 1:
 585                     line = fp.readline()
 586                     if not line:
 587                         break
 588                     options.nodocstrings[line[:-1]] = 1
 589             finally:
 590                 fp.close()
 591
 592     # calculate escapes
 593     make_escapes(options.escape)
 594
 595     # calculate all keywords
 596     options.keywords.extend(default_keywords)
 597
 598     # initialize list of strings to exclude
 599     if options.excludefilename:
 600         try:
 601             fp = open(options.excludefilename)
 602             options.toexclude = fp.readlines()
 603             fp.close()
 604         except IOError:
 605             print >> sys.stderr, _(
 606                 "Can't read --exclude-file: %s") % options.excludefilename
 607             sys.exit(1)
 608     else:
 609         options.toexclude = []
 610
 611     # resolve args to module lists
 612     expanded = []
 613     for arg in args:
 614         if arg == '-':
 615             expanded.append(arg)
 616         else:
 617             expanded.extend(getFilesForName(arg))
 618     args = expanded
 619
 620     # slurp through all the files
 621     eater = TokenEater(options)
 622     for filename in args:
 623         if filename == '-':
 624             if options.verbose:
 625                 print _('Reading standard input')
 626             fp = sys.stdin
 627             closep = 0
 628         else:
 629             if options.verbose:
 630                 print _('Working on %s') % filename
 631             fp = open(filename)
 632             closep = 1
 633         try:
 634             eater.set_filename(filename)
 635             try:
 636                 tokenize.tokenize(fp.readline, eater)
 637             except tokenize.TokenError, e:
 638                 print >> sys.stderr, '%s: %s, line %d, column %d' % (
 639                     e[0], filename, e[1][0], e[1][1])
 640         finally:
 641             if closep:
 642                 fp.close()
 643
 644     # write the output
 645     if options.outfile == '-':
 646         fp = sys.stdout
 647         closep = 0
 648     else:
 649         if options.outpath:
 650             options.outfile = os.path.join(options.outpath, options.outfile)
 651         fp = open(options.outfile, 'w')
 652         closep = 1
 653     try:
 654         eater.write(fp)
 655     finally:
 656         if closep:
 657             fp.close()
 658
 659 \f
# Run the extractor when the file is executed as a script.
if __name__ == '__main__':
    main()
    # some more test strings
    # (sample _() calls that the extractor finds when it is run on this
    # file itself; they are evaluated only after main() has returned)
    _(u'a unicode string')
    _('*** Seen unexpected token "%(token)s"' % {'token': 'test'}) # this one creates a warning
    _('more' 'than' 'one' 'string')
 666