tools/pygettext.py

   1 #! /usr/bin/env python
   2 # Originally written by Barry Warsaw <bwarsaw@python.org>
   3 #
   4 # minimally patched to make it even more xgettext compatible
   5 # by Peter Funk <pf@artcom-gmbh.de>
   6
   7 # for selftesting
   8 try:
   9     import fintl
  10     _ = fintl.gettext
  11 except ImportError:
  12     def _(s): return s
  13
  14
  15 __doc__ = _("""pygettext -- Python equivalent of xgettext(1)
  16
  17 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
  18 internationalization of C programs.  Most of these tools are independent of
  19 the programming language and can be used from within Python programs.  Martin
  20 von Loewis' work[1] helps considerably in this regard.
  21
  22 There's one problem though; xgettext is the program that scans source code
  23 looking for message strings, but it groks only C (or C++).  Python introduces
  24 a few wrinkles, such as dual quoting characters, triple quoted strings, and
  25 raw strings.  xgettext understands none of this.
  26
  27 Enter pygettext, which uses Python's standard tokenize module to scan Python
  28 source code, generating .pot files identical to what GNU xgettext[2] generates
  29 for C and C++ code.  From there, the standard GNU tools can be used.
  30
  31 A word about marking Python strings as candidates for translation.  GNU
  32 xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
  33 gettext_noop.  But those can be a lot of text to include all over your code.
  34 C and C++ have a trick: they use the C preprocessor.  Most internationalized C
  35 source includes a #define for gettext() to _() so that what has to be written
  36 in the source is much less.  Thus these are both translatable strings:
  37
  38     gettext("Translatable String")
  39     _("Translatable String")
  40
  41 Python of course has no preprocessor so this doesn't work so well.  Thus,
  42 pygettext searches only for _() by default, but see the -k/--keyword flag
  43 below for how to augment this.
  44
  45  [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
  46  [2] http://www.gnu.org/software/gettext/gettext.html
  47
  48 NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
  49 where ever possible.  However some options are still missing or are not fully
  50 implemented.  Also, xgettext's use of command line switches with option
  51 arguments is broken, and in these cases, pygettext just defines additional
  52 switches.
  53
  54 Usage: pygettext [options] inputfile ...
  55
  56 Options:
  57
  58     -a
  59     --extract-all
  60         Extract all strings
  61
  62     -d name
  63     --default-domain=name
  64         Rename the default output file from messages.pot to name.pot
  65
  66     -E
  67     --escape
  68         replace non-ASCII characters with octal escape sequences.
  69
  70     -h
  71     --help
  72         print this help message and exit
  73
  74     -k word
  75     --keyword=word
  76         Keywords to look for in addition to the default set, which are:
  77         %(DEFAULTKEYWORDS)s
  78
  79         You can have multiple -k flags on the command line.
  80
  81     -K
  82     --no-default-keywords
  83         Disable the default set of keywords (see above).  Any keywords
  84         explicitly added with the -k/--keyword option are still recognized.
  85
  86     --no-location
  87         Do not write filename/lineno location comments.
  88
  89     -n
  90     --add-location
  91         Write filename/lineno location comments indicating where each
  92         extracted string is found in the source.  These lines appear before
  93         each msgid.  The style of comments is controlled by the -S/--style
  94         option.  This is the default.
  95
  96     -S stylename
  97     --style stylename
  98         Specify which style to use for location comments.  Two styles are
  99         supported:
 100
 101         Solaris  # File: filename, line: line-number
 102         GNU      #: filename:line
 103
 104         The style name is case insensitive.  GNU style is the default.
 105
 106     -o filename
 107     --output=filename
 108         Rename the default output file from messages.pot to filename.  If
 109         filename is `-' then the output is sent to standard out.
 110
 111     -p dir
 112     --output-dir=dir
 113         Output files will be placed in directory dir.
 114
 115     -v
 116     --verbose
 117         Print the names of the files being processed.
 118
 119     -V
 120     --version
 121         Print the version of pygettext and exit.
 122
 123     -w columns
 124     --width=columns
 125         Set width of output to columns.
 126
 127     -x filename
 128     --exclude-file=filename
 129         Specify a file that contains a list of strings that are not be
 130         extracted from the input files.  Each string to be excluded must
 131         appear on a line by itself in the file.
 132
 133 If `inputfile' is -, standard input is read.
 134
 135 """)
 136
 137 import os
 138 import sys
 139 import time
 140 import getopt
 141 import tokenize
 142
# Version string reported by -V/--version and interpolated into the
# "Generated-By" line of the .pot header.
__version__ = '1.1'

# Keywords that mark translatable calls; main() appends them to any
# keywords given with -k and empties the list when -K is given.
default_keywords = ['_']
# Comma-separated form of the defaults, interpolated into the usage text
# through the %(DEFAULTKEYWORDS)s placeholder.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used when joining string fragments back together.
EMPTYSTRING = ''
 149
 150
 151 \f
 152 # The normal pot-file header. msgmerge and EMACS' po-mode work better if
 153 # it's there.
 154 pot_header = _('''\
 155 # SOME DESCRIPTIVE TITLE.
 156 # Copyright (C) YEAR ORGANIZATION
 157 # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
 158 #
 159 msgid ""
 160 msgstr ""
 161 "Project-Id-Version: PACKAGE VERSION\\n"
 162 "PO-Revision-Date: %(time)s\\n"
 163 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
 164 "Language-Team: LANGUAGE <LL@li.org>\\n"
 165 "MIME-Version: 1.0\\n"
 166 "Content-Type: text/plain; charset=CHARSET\\n"
 167 "Content-Transfer-Encoding: ENCODING\\n"
 168 "Generated-By: pygettext.py %(version)s\\n"
 169
 170 ''')
 171
 172 \f
 173 def usage(code, msg=''):
 174     print __doc__ % globals()
 175     if msg:
 176         print msg
 177     sys.exit(code)
 178
 179
 180 \f
 181 escapes = []
 182
 183 def make_escapes(pass_iso8859):
 184     global escapes
 185     if pass_iso8859:
 186         # Allow iso-8859 characters to pass through so that e.g. 'msgid
 187         # "Höhe"' would result not result in 'msgid "H\366he"'.  Otherwise we
 188         # escape any character outside the 32..126 range.
 189         mod = 128
 190     else:
 191         mod = 256
 192     for i in range(256):
 193         if 32 <= (i % mod) <= 126:
 194             escapes.append(chr(i))
 195         else:
 196             escapes.append("\\%03o" % i)
 197     escapes[ord('\\')] = '\\\\'
 198     escapes[ord('\t')] = '\\t'
 199     escapes[ord('\r')] = '\\r'
 200     escapes[ord('\n')] = '\\n'
 201     escapes[ord('\"')] = '\\"'
 202
 203
 204 def escape(s):
 205     global escapes
 206     s = list(s)
 207     for i in range(len(s)):
 208         s[i] = escapes[ord(s[i])]
 209     return EMPTYSTRING.join(s)
 210
 211
 212 def safe_eval(s):
 213     # unwrap quotes, safely
 214     return eval(s, {'__builtins__':{}}, {})
 215
 216
 217 def normalize(s):
 218     # This converts the various Python string types into a format that is
 219     # appropriate for .po files, namely much closer to C style.
 220     lines = s.split('\n')
 221     if len(lines) == 1:
 222         s = '"' + escape(s) + '"'
 223     else:
 224         if not lines[-1]:
 225             del lines[-1]
 226             lines[-1] = lines[-1] + '\n'
 227         for i in range(len(lines)):
 228             lines[i] = escape(lines[i])
 229         lineterm = '\\n"\n"'
 230         s = '""\n"' + lineterm.join(lines) + '"'
 231     return s
 232
 233
 234 \f
 235 class TokenEater:
 236     def __init__(self, options):
 237         self.__options = options
 238         self.__messages = {}
 239         self.__state = self.__waiting
 240         self.__data = []
 241         self.__lineno = -1
 242
 243     def __call__(self, ttype, tstring, stup, etup, line):
 244         # dispatch
 245         self.__state(ttype, tstring, stup[0])
 246
 247     def __waiting(self, ttype, tstring, lineno):
 248         if ttype == tokenize.NAME and tstring in self.__options.keywords:
 249             self.__state = self.__keywordseen
 250
 251     def __keywordseen(self, ttype, tstring, lineno):
 252         if ttype == tokenize.OP and tstring == '(':
 253             self.__data = []
 254             self.__lineno = lineno
 255             self.__state = self.__openseen
 256         else:
 257             self.__state = self.__waiting
 258
 259     def __openseen(self, ttype, tstring, lineno):
 260         if ttype == tokenize.OP and tstring == ')':
 261             # We've seen the last of the translatable strings.  Record the
 262             # line number of the first line of the strings and update the list
 263             # of messages seen.  Reset state for the next batch.  If there
 264             # were no strings inside _(), then just ignore this entry.
 265             if self.__data:
 266                 msg = EMPTYSTRING.join(self.__data)
 267                 if not msg in self.__options.toexclude:
 268                     entry = (self.__curfile, self.__lineno)
 269                     linenos = self.__messages.get(msg)
 270                     if linenos is None:
 271                         self.__messages[msg] = [entry]
 272                     else:
 273                         linenos.append(entry)
 274             self.__state = self.__waiting
 275         elif ttype == tokenize.STRING:
 276             self.__data.append(safe_eval(tstring))
 277         # TBD: should we warn if we seen anything else?
 278
 279     def set_filename(self, filename):
 280         self.__curfile = filename
 281
 282     def write(self, fp):
 283         options = self.__options
 284         timestamp = time.ctime(time.time())
 285         # common header
 286         try:
 287             sys.stdout = fp
 288             # The time stamp in the header doesn't have the same format
 289             # as that generated by xgettext...
 290             print pot_header % {'time': timestamp, 'version': __version__}
 291             for k, v in self.__messages.items():
 292                 if not options.writelocations:
 293                     pass
 294                 # location comments are different b/w Solaris and GNU:
 295                 elif options.locationstyle == options.SOLARIS:
 296                     for filename, lineno in v:
 297                         d = {'filename': filename, 'lineno': lineno}
 298                         print _('# File: %(filename)s, line: %(lineno)d') % d
 299                 elif options.locationstyle == options.GNU:
 300                     # fit as many locations on one line, as long as the
 301                     # resulting line length doesn't exceeds 'options.width'
 302                     locline = '#:'
 303                     for filename, lineno in v:
 304                         d = {'filename': filename, 'lineno': lineno}
 305                         s = _(' %(filename)s:%(lineno)d') % d
 306                         if len(locline) + len(s) <= options.width:
 307                             locline = locline + s
 308                         else:
 309                             print locline
 310                             locline = "#:" + s
 311                     if len(locline) > 2:
 312                         print locline
 313                 # TBD: sorting, normalizing
 314                 print 'msgid', normalize(k)
 315                 print 'msgstr ""\n'
 316         finally:
 317             sys.stdout = sys.__stdout__
 318
 319 \f
 320 def main():
 321     global default_keywords
 322     try:
 323         opts, args = getopt.getopt(
 324             sys.argv[1:],
 325             'ad:Ehk:Kno:p:S:Vvw:x:',
 326             ['extract-all', 'default-domain', 'escape', 'help',
 327              'keyword=', 'no-default-keywords',
 328              'add-location', 'no-location', 'output=', 'output-dir=',
 329              'style=', 'verbose', 'version', 'width=', 'exclude-file=',
 330              ])
 331     except getopt.error, msg:
 332         usage(1, msg)
 333
 334     # for holding option values
 335     class Options:
 336         # constants
 337         GNU = 1
 338         SOLARIS = 2
 339         # defaults
 340         extractall = 0 # FIXME: currently this option has no effect at all.
 341         escape = 0
 342         keywords = []
 343         outpath = ''
 344         outfile = 'messages.pot'
 345         writelocations = 1
 346         locationstyle = GNU
 347         verbose = 0
 348         width = 78
 349         excludefilename = ''
 350
 351     options = Options()
 352     locations = {'gnu' : options.GNU,
 353                  'solaris' : options.SOLARIS,
 354                  }
 355
 356     # parse options
 357     for opt, arg in opts:
 358         if opt in ('-h', '--help'):
 359             usage(0)
 360         elif opt in ('-a', '--extract-all'):
 361             options.extractall = 1
 362         elif opt in ('-d', '--default-domain'):
 363             options.outfile = arg + '.pot'
 364         elif opt in ('-E', '--escape'):
 365             options.escape = 1
 366         elif opt in ('-k', '--keyword'):
 367             options.keywords.append(arg)
 368         elif opt in ('-K', '--no-default-keywords'):
 369             default_keywords = []
 370         elif opt in ('-n', '--add-location'):
 371             options.writelocations = 1
 372         elif opt in ('--no-location',):
 373             options.writelocations = 0
 374         elif opt in ('-S', '--style'):
 375             options.locationstyle = locations.get(arg.lower())
 376             if options.locationstyle is None:
 377                 usage(1, _('Invalid value for --style: %s') % arg)
 378         elif opt in ('-o', '--output'):
 379             options.outfile = arg
 380         elif opt in ('-p', '--output-dir'):
 381             options.outpath = arg
 382         elif opt in ('-v', '--verbose'):
 383             options.verbose = 1
 384         elif opt in ('-V', '--version'):
 385             print _('pygettext.py (xgettext for Python) %s') % __version__
 386             sys.exit(0)
 387         elif opt in ('-w', '--width'):
 388             try:
 389                 options.width = int(arg)
 390             except ValueError:
 391                 usage(1, _('--width argument must be an integer: %s') % arg)
 392         elif opt in ('-x', '--exclude-file'):
 393             options.excludefilename = arg
 394
 395     # calculate escapes
 396     make_escapes(options.escape)
 397
 398     # calculate all keywords
 399     options.keywords.extend(default_keywords)
 400
 401     # initialize list of strings to exclude
 402     if options.excludefilename:
 403         try:
 404             fp = open(options.excludefilename)
 405             options.toexclude = fp.readlines()
 406             fp.close()
 407         except IOError:
 408             sys.stderr.write(_("Can't read --exclude-file: %s") %
 409                              options.excludefilename)
 410             sys.exit(1)
 411     else:
 412         options.toexclude = []
 413
 414     # slurp through all the files
 415     eater = TokenEater(options)
 416     for filename in args:
 417         if filename == '-':
 418             if options.verbose:
 419                 print _('Reading standard input')
 420             fp = sys.stdin
 421             closep = 0
 422         else:
 423             if options.verbose:
 424                 print _('Working on %s') % filename
 425             fp = open(filename)
 426             closep = 1
 427         try:
 428             eater.set_filename(filename)
 429             tokenize.tokenize(fp.readline, eater)
 430         finally:
 431             if closep:
 432                 fp.close()
 433
 434     # write the output
 435     if options.outfile == '-':
 436         fp = sys.stdout
 437         closep = 0
 438     else:
 439         if options.outpath:
 440             options.outfile = os.path.join(options.outpath, options.outfile)
 441         fp = open(options.outfile, 'w')
 442         closep = 1
 443     try:
 444         eater.write(fp)
 445     finally:
 446         if closep:
 447             fp.close()
 448
 449 \f
# Run the extractor only when this file is executed as a script.
if __name__ == '__main__':
    main()
    # some more test strings
    # NOTE(review): the result is discarded; presumably this exists so that
    # running pygettext on its own source finds a u'' literal -- confirm.
    _(u'a unicode string')