tools/pygettext.py

   1 #! /usr/bin/env python
   2 # Originally written by Barry Warsaw <bwarsaw@python.org>
   3 #
   4 # minimally patched to make it even more xgettext compatible
   5 # by Peter Funk <pf@artcom-gmbh.de>
   6 #
   7 # 2001-11-21 Jürgen Hermann <jh@web.de>
   8 # Checks that _() only contains string literals added, and
   9 # command line args are resolved to module lists, i.e. you
  10 # can now pass a filename, a module or package name, or a
  11 # directory (including globbing chars, important for Win32).
  12 #
  13
# for selftesting: when a module named `fintl` can be imported, its gettext
# function becomes the translation marker `_`; otherwise `_` falls back to an
# identity function so that every _("...") call in this file still works and
# simply returns its argument.
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s
  20
  21
  22 __doc__ = _("""pygettext -- Python equivalent of xgettext(1)
  23
  24 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
  25 internationalization of C programs.  Most of these tools are independent of
  26 the programming language and can be used from within Python programs.  Martin
  27 von Loewis' work[1] helps considerably in this regard.
  28
  29 There's one problem though; xgettext is the program that scans source code
  30 looking for message strings, but it groks only C (or C++).  Python introduces
  31 a few wrinkles, such as dual quoting characters, triple quoted strings, and
  32 raw strings.  xgettext understands none of this.
  33
  34 Enter pygettext, which uses Python's standard tokenize module to scan Python
  35 source code, generating .pot files identical to what GNU xgettext[2] generates
  36 for C and C++ code.  From there, the standard GNU tools can be used.
  37
  38 A word about marking Python strings as candidates for translation.  GNU
  39 xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
  40 gettext_noop.  But those can be a lot of text to include all over your code.
  41 C and C++ have a trick: they use the C preprocessor.  Most internationalized C
  42 source includes a #define for gettext() to _() so that what has to be written
  43 in the source is much less.  Thus these are both translatable strings:
  44
  45     gettext("Translatable String")
  46     _("Translatable String")
  47
  48 Python of course has no preprocessor so this doesn't work so well.  Thus,
  49 pygettext searches only for _() by default, but see the -k/--keyword flag
  50 below for how to augment this.
  51
  52  [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
  53  [2] http://www.gnu.org/software/gettext/gettext.html
  54
  55 NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
  56 where ever possible.  However some options are still missing or are not fully
  57 implemented.  Also, xgettext's use of command line switches with option
  58 arguments is broken, and in these cases, pygettext just defines additional
  59 switches.
  60
  61 Usage: pygettext [options] inputfile ...
  62
  63 Options:
  64
  65     -a
  66     --extract-all
  67         Extract all strings
  68
  69     -d name
  70     --default-domain=name
  71         Rename the default output file from messages.pot to name.pot
  72
  73     -E
  74     --escape
  75         replace non-ASCII characters with octal escape sequences.
  76
  77     -h
  78     --help
  79         print this help message and exit
  80
  81     -k word
  82     --keyword=word
  83         Keywords to look for in addition to the default set, which are:
  84         %(DEFAULTKEYWORDS)s
  85
  86         You can have multiple -k flags on the command line.
  87
  88     -K
  89     --no-default-keywords
  90         Disable the default set of keywords (see above).  Any keywords
  91         explicitly added with the -k/--keyword option are still recognized.
  92
  93     --no-location
  94         Do not write filename/lineno location comments.
  95
  96     -n
  97     --add-location
  98         Write filename/lineno location comments indicating where each
  99         extracted string is found in the source.  These lines appear before
 100         each msgid.  The style of comments is controlled by the -S/--style
 101         option.  This is the default.
 102
 103     -S stylename
 104     --style stylename
 105         Specify which style to use for location comments.  Two styles are
 106         supported:
 107
 108         Solaris  # File: filename, line: line-number
 109         GNU      #: filename:line
 110
 111         The style name is case insensitive.  GNU style is the default.
 112
 113     -o filename
 114     --output=filename
 115         Rename the default output file from messages.pot to filename.  If
 116         filename is `-' then the output is sent to standard out.
 117
 118     -p dir
 119     --output-dir=dir
 120         Output files will be placed in directory dir.
 121
 122     -v
 123     --verbose
 124         Print the names of the files being processed.
 125
 126     -V
 127     --version
 128         Print the version of pygettext and exit.
 129
 130     -w columns
 131     --width=columns
 132         Set width of output to columns.
 133
 134     -x filename
 135     --exclude-file=filename
 136         Specify a file that contains a list of strings that are not be
 137         extracted from the input files.  Each string to be excluded must
 138         appear on a line by itself in the file.
 139
 140 If `inputfile' is -, standard input is read.
 141
 142 """)
 143
 144 import os
 145 import sys
 146 import time
 147 import getopt
 148 import token
 149 import tokenize
 150
# Version string; printed by -V/--version and written into the
# "Generated-By" line of the .pot header.
__version__ = '1.1'

# Keywords treated as translation markers by default.  main() rebinds this
# name to an empty list when -K/--no-default-keywords is given.
default_keywords = ['_']
# Comma-separated rendering of the defaults; the usage text refers to it
# through its %(DEFAULTKEYWORDS)s placeholder.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator for joining string fragments without inserting anything.
EMPTYSTRING = ''
 157
 158
 159 \f
# The normal pot-file header.  msgmerge and Emacs' po-mode work better if
# it's there.  The %(time)s and %(version)s placeholders are filled in by
# TokenEater.write().  The doubled backslashes keep a literal backslash-n
# sequence inside the quoted header lines that end up in the output.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"PO-Revision-Date: %(time)s\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
 179
 180 \f
 181 def usage(code, msg=''):
 182     print __doc__ % globals()
 183     if msg:
 184         print msg
 185     sys.exit(code)
 186
 187
 188 \f
 189 escapes = []
 190
 191 def make_escapes(pass_iso8859):
 192     global escapes
 193     if pass_iso8859:
 194         # Allow iso-8859 characters to pass through so that e.g. 'msgid
 195         # "Höhe"' would result not result in 'msgid "H\366he"'.  Otherwise we
 196         # escape any character outside the 32..126 range.
 197         mod = 128
 198     else:
 199         mod = 256
 200     for i in range(256):
 201         if 32 <= (i % mod) <= 126:
 202             escapes.append(chr(i))
 203         else:
 204             escapes.append("\\%03o" % i)
 205     escapes[ord('\\')] = '\\\\'
 206     escapes[ord('\t')] = '\\t'
 207     escapes[ord('\r')] = '\\r'
 208     escapes[ord('\n')] = '\\n'
 209     escapes[ord('\"')] = '\\"'
 210
 211
 212 def escape(s):
 213     global escapes
 214     s = list(s)
 215     for i in range(len(s)):
 216         s[i] = escapes[ord(s[i])]
 217     return EMPTYSTRING.join(s)
 218
 219
 220 def safe_eval(s):
 221     # unwrap quotes, safely
 222     return eval(s, {'__builtins__':{}}, {})
 223
 224
 225 def normalize(s):
 226     # This converts the various Python string types into a format that is
 227     # appropriate for .po files, namely much closer to C style.
 228     lines = s.split('\n')
 229     if len(lines) == 1:
 230         s = '"' + escape(s) + '"'
 231     else:
 232         if not lines[-1]:
 233             del lines[-1]
 234             lines[-1] = lines[-1] + '\n'
 235         for i in range(len(lines)):
 236             lines[i] = escape(lines[i])
 237         lineterm = '\\n"\n"'
 238         s = '""\n"' + lineterm.join(lines) + '"'
 239     return s
 240
 241 \f
 242
 243 def containsAny(str, set):
 244     """ Check whether 'str' contains ANY of the chars in 'set'
 245     """
 246     return 1 in [c in str for c in set]
 247
 248
 249 def _visit_pyfiles(list, dirname, names):
 250     """ Helper for getFilesForName().
 251     """
 252     # get extension for python source files
 253     if not globals().has_key('_py_ext'):
 254         import imp
 255         global _py_ext
 256         _py_ext = [triple[0] for triple in imp.get_suffixes() if triple[2] == imp.PY_SOURCE][0]
 257
 258     # don't recurse into CVS directories
 259     if 'CVS' in names:
 260         names.remove('CVS')
 261
 262     # add all *.py files to list
 263     list.extend(
 264         [os.path.join(dirname, file)
 265             for file in names
 266                 if os.path.splitext(file)[1] == _py_ext])
 267
 268
 269 def _get_modpkg_path(dotted_name, pathlist=None):
 270     """ Get the filesystem path for a module or a package.
 271
 272         Return the file system path to a file for a module,
 273         and to a directory for a package. Return None if
 274         the name is not found, or is a builtin or extension module.
 275     """
 276     import imp
 277
 278     # split off top-most name
 279     parts = dotted_name.split('.', 1)
 280
 281     if len(parts) > 1:
 282         # we have a dotted path, import top-level package
 283         try:
 284             file, pathname, description = imp.find_module(parts[0], pathlist)
 285             if file: file.close()
 286         except ImportError:
 287             return None
 288
 289         # check if it's indeed a package
 290         if description[2] == imp.PKG_DIRECTORY:
 291             # recursively handle the remaining name parts
 292             pathname = _get_modpkg_path(parts[1], [pathname])
 293         else:
 294             pathname = None
 295     else:
 296         # plain name
 297         try:
 298             file, pathname, description = imp.find_module(dotted_name, pathlist)
 299             if file: file.close()
 300             if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
 301                 pathname = None
 302         except ImportError:
 303             pathname = None
 304
 305     return pathname
 306
 307
 308 def getFilesForName(name):
 309     """ Get a list of module files for a filename, a module or package name,
 310         or a directory.
 311     """
 312     import imp
 313
 314     if not os.path.exists(name):
 315         # check for glob chars
 316         if containsAny(name, "*?[]"):
 317             import glob
 318             files = glob.glob(name)
 319             list = []
 320             for file in files:
 321                 list.extend(getFilesForName(file))
 322             return list
 323
 324         # try to find module or package
 325         name = _get_modpkg_path(name)
 326         if not name:
 327             return []
 328
 329     if os.path.isdir(name):
 330         # find all python files in directory
 331         list = []
 332         os.path.walk(name, _visit_pyfiles, list)
 333         return list
 334     elif os.path.exists(name):
 335         # a single file
 336         return [name]
 337
 338     return []
 339
 340 \f
 341 class TokenEater:
 342     def __init__(self, options):
 343         self.__options = options
 344         self.__messages = {}
 345         self.__state = self.__waiting
 346         self.__data = []
 347         self.__lineno = -1
 348
 349     def __call__(self, ttype, tstring, stup, etup, line):
 350         # dispatch
 351         self.__state(ttype, tstring, stup[0])
 352
 353     def __waiting(self, ttype, tstring, lineno):
 354         if ttype == tokenize.NAME and tstring in self.__options.keywords:
 355             self.__state = self.__keywordseen
 356
 357     def __keywordseen(self, ttype, tstring, lineno):
 358         if ttype == tokenize.OP and tstring == '(':
 359             self.__data = []
 360             self.__lineno = lineno
 361             self.__state = self.__openseen
 362         else:
 363             self.__state = self.__waiting
 364
 365     def __openseen(self, ttype, tstring, lineno):
 366         if ttype == tokenize.OP and tstring == ')':
 367             # We've seen the last of the translatable strings.  Record the
 368             # line number of the first line of the strings and update the list
 369             # of messages seen.  Reset state for the next batch.  If there
 370             # were no strings inside _(), then just ignore this entry.
 371             if self.__data:
 372                 msg = EMPTYSTRING.join(self.__data)
 373                 if not msg in self.__options.toexclude:
 374                     entry = (self.__curfile, self.__lineno)
 375                     linenos = self.__messages.get(msg)
 376                     if linenos is None:
 377                         self.__messages[msg] = [entry]
 378                     else:
 379                         linenos.append(entry)
 380             self.__state = self.__waiting
 381         elif ttype == tokenize.STRING:
 382             self.__data.append(safe_eval(tstring))
 383         elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
 384                            token.NEWLINE, tokenize.NL]:
 385             # warn if we see anything else than STRING or whitespace
 386             print >>sys.stderr, _('*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"') % {
 387                 'token': tstring, 'file': self.__curfile, 'lineno': self.__lineno}
 388             self.__state = self.__waiting
 389
 390     def set_filename(self, filename):
 391         self.__curfile = filename
 392
 393     def write(self, fp):
 394         options = self.__options
 395         timestamp = time.ctime(time.time())
 396         # common header
 397         try:
 398             sys.stdout = fp
 399             # The time stamp in the header doesn't have the same format
 400             # as that generated by xgettext...
 401             print pot_header % {'time': timestamp, 'version': __version__}
 402             for k, v in self.__messages.items():
 403                 if not options.writelocations:
 404                     pass
 405                 # location comments are different b/w Solaris and GNU:
 406                 elif options.locationstyle == options.SOLARIS:
 407                     for filename, lineno in v:
 408                         d = {'filename': filename, 'lineno': lineno}
 409                         print _('# File: %(filename)s, line: %(lineno)d') % d
 410                 elif options.locationstyle == options.GNU:
 411                     # fit as many locations on one line, as long as the
 412                     # resulting line length doesn't exceeds 'options.width'
 413                     locline = '#:'
 414                     for filename, lineno in v:
 415                         d = {'filename': filename, 'lineno': lineno}
 416                         s = _(' %(filename)s:%(lineno)d') % d
 417                         if len(locline) + len(s) <= options.width:
 418                             locline = locline + s
 419                         else:
 420                             print locline
 421                             locline = "#:" + s
 422                     if len(locline) > 2:
 423                         print locline
 424                 # TBD: sorting, normalizing
 425                 print 'msgid', normalize(k)
 426                 print 'msgstr ""\n'
 427         finally:
 428             sys.stdout = sys.__stdout__
 429
 430 \f
 431 def main():
 432     global default_keywords
 433     try:
 434         opts, args = getopt.getopt(
 435             sys.argv[1:],
 436             'ad:Ehk:Kno:p:S:Vvw:x:',
 437             ['extract-all', 'default-domain', 'escape', 'help',
 438              'keyword=', 'no-default-keywords',
 439              'add-location', 'no-location', 'output=', 'output-dir=',
 440              'style=', 'verbose', 'version', 'width=', 'exclude-file=',
 441              ])
 442     except getopt.error, msg:
 443         usage(1, msg)
 444
 445     # for holding option values
 446     class Options:
 447         # constants
 448         GNU = 1
 449         SOLARIS = 2
 450         # defaults
 451         extractall = 0 # FIXME: currently this option has no effect at all.
 452         escape = 0
 453         keywords = []
 454         outpath = ''
 455         outfile = 'messages.pot'
 456         writelocations = 1
 457         locationstyle = GNU
 458         verbose = 0
 459         width = 78
 460         excludefilename = ''
 461
 462     options = Options()
 463     locations = {'gnu' : options.GNU,
 464                  'solaris' : options.SOLARIS,
 465                  }
 466
 467     # parse options
 468     for opt, arg in opts:
 469         if opt in ('-h', '--help'):
 470             usage(0)
 471         elif opt in ('-a', '--extract-all'):
 472             options.extractall = 1
 473         elif opt in ('-d', '--default-domain'):
 474             options.outfile = arg + '.pot'
 475         elif opt in ('-E', '--escape'):
 476             options.escape = 1
 477         elif opt in ('-k', '--keyword'):
 478             options.keywords.append(arg)
 479         elif opt in ('-K', '--no-default-keywords'):
 480             default_keywords = []
 481         elif opt in ('-n', '--add-location'):
 482             options.writelocations = 1
 483         elif opt in ('--no-location',):
 484             options.writelocations = 0
 485         elif opt in ('-S', '--style'):
 486             options.locationstyle = locations.get(arg.lower())
 487             if options.locationstyle is None:
 488                 usage(1, _('Invalid value for --style: %s') % arg)
 489         elif opt in ('-o', '--output'):
 490             options.outfile = arg
 491         elif opt in ('-p', '--output-dir'):
 492             options.outpath = arg
 493         elif opt in ('-v', '--verbose'):
 494             options.verbose = 1
 495         elif opt in ('-V', '--version'):
 496             print _('pygettext.py (xgettext for Python) %s') % __version__
 497             sys.exit(0)
 498         elif opt in ('-w', '--width'):
 499             try:
 500                 options.width = int(arg)
 501             except ValueError:
 502                 usage(1, _('--width argument must be an integer: %s') % arg)
 503         elif opt in ('-x', '--exclude-file'):
 504             options.excludefilename = arg
 505
 506     # calculate escapes
 507     make_escapes(options.escape)
 508
 509     # calculate all keywords
 510     options.keywords.extend(default_keywords)
 511
 512     # initialize list of strings to exclude
 513     if options.excludefilename:
 514         try:
 515             fp = open(options.excludefilename)
 516             options.toexclude = fp.readlines()
 517             fp.close()
 518         except IOError:
 519             sys.stderr.write(_("Can't read --exclude-file: %s") %
 520                              options.excludefilename)
 521             sys.exit(1)
 522     else:
 523         options.toexclude = []
 524
 525     # resolve args to module lists
 526     expanded = []
 527     for arg in args:
 528         expanded.extend(getFilesForName(arg))
 529     args = expanded
 530
 531     # slurp through all the files
 532     eater = TokenEater(options)
 533     for filename in args:
 534         if filename == '-':
 535             if options.verbose:
 536                 print _('Reading standard input')
 537             fp = sys.stdin
 538             closep = 0
 539         else:
 540             if options.verbose:
 541                 print _('Working on %s') % filename
 542             fp = open(filename)
 543             closep = 1
 544         try:
 545             eater.set_filename(filename)
 546             tokenize.tokenize(fp.readline, eater)
 547         finally:
 548             if closep:
 549                 fp.close()
 550
 551     # write the output
 552     if options.outfile == '-':
 553         fp = sys.stdout
 554         closep = 0
 555     else:
 556         if options.outpath:
 557             options.outfile = os.path.join(options.outpath, options.outfile)
 558         fp = open(options.outfile, 'w')
 559         closep = 1
 560     try:
 561         eater.write(fp)
 562     finally:
 563         if closep:
 564             fp.close()
 565
 566 \f
if __name__ == '__main__':
    main()
    # some more test strings; presumably they are here so that running
    # pygettext on its own source exercises the extractor -- the return
    # values of these calls are discarded.
    # a unicode literal
    _(u'a unicode string')
    # the argument is an expression, not a plain literal: the scanner sees
    # the '%' operator after the string and reports an unexpected token
    _('*** Seen unexpected token "%(token)s"' % {'token': 'test'})
    # adjacent literals are collected and joined into a single message
    _('more' 'than' 'one' 'string')
 573