1 #! /usr/bin/env python
2 # Originally written by Barry Warsaw <bwarsaw@python.org>
3 #
4 # minimally patched to make it even more xgettext compatible
5 # by Peter Funk <pf@artcom-gmbh.de>
7 # for selftesting
try:
    import fintl
except ImportError:
    # fintl could not be imported: fall back to an identity function so
    # that strings marked with _() are simply used as they are written.
    def _(s):
        return s
else:
    # fintl is available: route every marked string through its gettext().
    _ = fintl.gettext
15 __doc__ = _("""pygettext -- Python equivalent of xgettext(1)
17 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
18 internationalization of C programs. Most of these tools are independent of
19 the programming language and can be used from within Python programs. Martin
20 von Loewis' work[1] helps considerably in this regard.
22 There's one problem though; xgettext is the program that scans source code
23 looking for message strings, but it groks only C (or C++). Python introduces
24 a few wrinkles, such as dual quoting characters, triple quoted strings, and
25 raw strings. xgettext understands none of this.
27 Enter pygettext, which uses Python's standard tokenize module to scan Python
28 source code, generating .pot files identical to what GNU xgettext[2] generates
29 for C and C++ code. From there, the standard GNU tools can be used.
31 A word about marking Python strings as candidates for translation. GNU
32 xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
33 gettext_noop. But those can be a lot of text to include all over your code.
34 C and C++ have a trick: they use the C preprocessor. Most internationalized C
35 source includes a #define for gettext() to _() so that what has to be written
36 in the source is much less. Thus these are both translatable strings:
38 gettext("Translatable String")
39 _("Translatable String")
41 Python of course has no preprocessor so this doesn't work so well. Thus,
42 pygettext searches only for _() by default, but see the -k/--keyword flag
43 below for how to augment this.
45 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
46 [2] http://www.gnu.org/software/gettext/gettext.html
48 NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
49 where ever possible. However some options are still missing or are not fully
50 implemented. Also, xgettext's use of command line switches with option
51 arguments is broken, and in these cases, pygettext just defines additional
52 switches.
54 Usage: pygettext [options] inputfile ...
56 Options:
58 -a
59 --extract-all
60 Extract all strings
62 -d name
63 --default-domain=name
64 Rename the default output file from messages.pot to name.pot
66 -E
67 --escape
68 replace non-ASCII characters with octal escape sequences.
70 -h
71 --help
72 print this help message and exit
74 -k word
75 --keyword=word
76 Keywords to look for in addition to the default set, which are:
77 %(DEFAULTKEYWORDS)s
79 You can have multiple -k flags on the command line.
81 -K
82 --no-default-keywords
83 Disable the default set of keywords (see above). Any keywords
84 explicitly added with the -k/--keyword option are still recognized.
86 --no-location
87 Do not write filename/lineno location comments.
89 -n
90 --add-location
91 Write filename/lineno location comments indicating where each
92 extracted string is found in the source. These lines appear before
93 each msgid. The style of comments is controlled by the -S/--style
94 option. This is the default.
96 -S stylename
97 --style stylename
98 Specify which style to use for location comments. Two styles are
99 supported:
101 Solaris # File: filename, line: line-number
102 GNU #: filename:line
104 The style name is case insensitive. GNU style is the default.
106 -o filename
107 --output=filename
108 Rename the default output file from messages.pot to filename. If
109 filename is `-' then the output is sent to standard out.
111 -p dir
112 --output-dir=dir
113 Output files will be placed in directory dir.
115 -v
116 --verbose
117 Print the names of the files being processed.
119 -V
120 --version
121 Print the version of pygettext and exit.
123 -w columns
124 --width=columns
125 Set width of output to columns.
127 -x filename
128 --exclude-file=filename
129 Specify a file that contains a list of strings that are not be
130 extracted from the input files. Each string to be excluded must
131 appear on a line by itself in the file.
133 If `inputfile' is -, standard input is read.
135 """)
137 import os
138 import sys
139 import time
140 import getopt
141 import tokenize
# Version string; reported by -V/--version and interpolated into the
# "Generated-By" line of the pot-file header.
__version__ = '1.1'

# Function names whose call arguments are treated as translatable strings.
# main() adds these to any -k/--keyword values, and replaces the list with an
# empty one when -K/--no-default-keywords is given.
default_keywords = ['_']
# Comma-separated rendering of the defaults; the help text refers to it via
# %(DEFAULTKEYWORDS)s when usage() formats __doc__ with globals().
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used when joining string fragments together.
EMPTYSTRING = ''
151 \f
# The normal pot-file header. msgmerge and EMACS' po-mode work better if
# it's there.
#
# This is a %-format template: TokenEater.write() fills in %(time)s and
# %(version)s.  The doubled backslashes keep a literal backslash-n sequence
# in the emitted header lines instead of a real newline.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"PO-Revision-Date: %(time)s\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"
''')
172 \f
173 def usage(code, msg=''):
174 print __doc__ % globals()
175 if msg:
176 print msg
177 sys.exit(code)
180 \f
# Translation table used by escape(): escapes[n] is the text written to the
# .po file for the character with ordinal n.  Filled in by make_escapes().
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the global `escapes' table with one entry per ordinal 0..255.

    pass_iso8859 -- if true, characters 160..254 are passed through unchanged
                    in addition to printable ASCII; if false, only 32..126
                    pass through.  Everything else becomes a three digit
                    octal escape, except for backslash, tab, CR, LF and the
                    double quote, which get their usual C-style escapes.

    The table is rebuilt from scratch on every call and the existing list
    object is updated in place, so calling this function more than once
    always leaves exactly 256 entries that reflect the latest argument.
    """
    global escapes
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. 'msgid
        # "Höhe"' would not result in 'msgid "H\366he"'.  Otherwise we
        # escape any character outside the 32..126 range.
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    # These are printable or common control characters that need a symbolic
    # escape rather than pass-through or an octal code.
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Replace the contents rather than appending, so that repeated calls do
    # not grow the table or leave stale entries behind.
    escapes[:] = table
def escape(s):
    """Return `s' with every character replaced by its `escapes' entry.

    Each character is looked up by ordinal in the global table built by
    make_escapes(), and the pieces are joined back into a single string.
    """
    return EMPTYSTRING.join([escapes[ord(char)] for char in s])
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    The expression is evaluated with an empty builtins mapping and empty
    locals, so plain names such as len are not resolvable.
    """
    # unwrap quotes, safely
    # NOTE(review): emptying __builtins__ restricts name lookup but is not a
    # real sandbox; this relies on only ever being handed STRING tokens.
    restricted_globals = {'__builtins__': {}}
    no_locals = {}
    return eval(s, restricted_globals, no_locals)
def normalize(s):
    """Render `s' as the quoted text that follows `msgid' in a .po file.

    This converts the various Python string types into a format that is
    appropriate for .po files, namely much closer to C style.  A string
    without newlines becomes one double-quoted, escaped string.  A string
    with newlines becomes an empty "" followed by one quoted line per source
    line, each but the last ending in an escaped newline.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # The string ended with a newline: drop the empty tail and put the
        # newline back on the real last line so that it gets escaped too.
        pieces.pop()
        pieces[-1] = pieces[-1] + '\n'
    escaped = [escape(piece) for piece in pieces]
    separator = '\\n"\n"'
    return '""\n"' + separator.join(escaped) + '"'
234 \f
class TokenEater:
    """Token callback that collects strings passed to keyword calls.

    An instance is called once per token (see __call__) and runs a small
    state machine: wait for a keyword name, expect an opening parenthesis,
    then gather string tokens until the closing parenthesis.  Collected
    messages map to a list of (filename, lineno) locations and are written
    out in .pot format by write().

    The `options' object must provide `keywords' and `toexclude' for
    scanning, and `writelocations', `locationstyle', `SOLARIS', `GNU' and
    `width' for write().
    """

    def __init__(self, options):
        self.__options = options
        # message text -> list of (filename, lineno) where it was seen
        self.__messages = {}
        self.__state = self.__waiting
        # string fragments of the call currently being scanned
        self.__data = []
        self.__lineno = -1
        # name recorded with each location; set via set_filename()
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        """Feed one token to the current state; stup[0] is its start row."""
        # dispatch
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Idle state: look for a NAME token that is one of the keywords."""
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        """A keyword was just seen: only an immediate '(' starts a message."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside keyword(...): gather string tokens until the first ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                msg = EMPTYSTRING.join(self.__data)
                if not msg in self.__options.toexclude:
                    entry = (self.__curfile, self.__lineno)
                    linenos = self.__messages.get(msg)
                    if linenos is None:
                        self.__messages[msg] = [entry]
                    else:
                        linenos.append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we see anything else?

    def set_filename(self, filename):
        """Set the file name recorded with subsequently found messages."""
        self.__curfile = filename

    def write(self, fp):
        """Write the header and all collected messages to file object `fp'.

        Output goes straight to fp.write(), so the process-wide sys.stdout is
        never rebound and any redirection the caller installed is preserved.
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        emit = fp.write
        # common header
        # The time stamp in the header doesn't have the same format
        # as that generated by xgettext...
        emit(pot_header % {'time': timestamp, 'version': __version__})
        emit('\n')
        for k, v in self.__messages.items():
            if not options.writelocations:
                pass
            # location comments are different b/w Solaris and GNU:
            elif options.locationstyle == options.SOLARIS:
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    emit(_('# File: %(filename)s, line: %(lineno)d') % d)
                    emit('\n')
            elif options.locationstyle == options.GNU:
                # fit as many locations on one line, as long as the
                # resulting line length doesn't exceed 'options.width'
                locline = '#:'
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    s = _(' %(filename)s:%(lineno)d') % d
                    if len(locline) + len(s) <= options.width:
                        locline = locline + s
                    else:
                        # Flush only if something was accumulated; otherwise
                        # an over-long first location would produce a bare
                        # "#:" line.
                        if len(locline) > 2:
                            emit(locline + '\n')
                        locline = "#:" + s
                if len(locline) > 2:
                    emit(locline + '\n')
            # TBD: sorting, normalizing
            emit('msgid ' + normalize(k) + '\n')
            emit('msgstr ""\n\n')
319 \f
320 def main():
321 global default_keywords
322 try:
323 opts, args = getopt.getopt(
324 sys.argv[1:],
325 'ad:Ehk:Kno:p:S:Vvw:x:',
326 ['extract-all', 'default-domain', 'escape', 'help',
327 'keyword=', 'no-default-keywords',
328 'add-location', 'no-location', 'output=', 'output-dir=',
329 'style=', 'verbose', 'version', 'width=', 'exclude-file=',
330 ])
331 except getopt.error, msg:
332 usage(1, msg)
334 # for holding option values
335 class Options:
336 # constants
337 GNU = 1
338 SOLARIS = 2
339 # defaults
340 extractall = 0 # FIXME: currently this option has no effect at all.
341 escape = 0
342 keywords = []
343 outpath = ''
344 outfile = 'messages.pot'
345 writelocations = 1
346 locationstyle = GNU
347 verbose = 0
348 width = 78
349 excludefilename = ''
351 options = Options()
352 locations = {'gnu' : options.GNU,
353 'solaris' : options.SOLARIS,
354 }
356 # parse options
357 for opt, arg in opts:
358 if opt in ('-h', '--help'):
359 usage(0)
360 elif opt in ('-a', '--extract-all'):
361 options.extractall = 1
362 elif opt in ('-d', '--default-domain'):
363 options.outfile = arg + '.pot'
364 elif opt in ('-E', '--escape'):
365 options.escape = 1
366 elif opt in ('-k', '--keyword'):
367 options.keywords.append(arg)
368 elif opt in ('-K', '--no-default-keywords'):
369 default_keywords = []
370 elif opt in ('-n', '--add-location'):
371 options.writelocations = 1
372 elif opt in ('--no-location',):
373 options.writelocations = 0
374 elif opt in ('-S', '--style'):
375 options.locationstyle = locations.get(arg.lower())
376 if options.locationstyle is None:
377 usage(1, _('Invalid value for --style: %s') % arg)
378 elif opt in ('-o', '--output'):
379 options.outfile = arg
380 elif opt in ('-p', '--output-dir'):
381 options.outpath = arg
382 elif opt in ('-v', '--verbose'):
383 options.verbose = 1
384 elif opt in ('-V', '--version'):
385 print _('pygettext.py (xgettext for Python) %s') % __version__
386 sys.exit(0)
387 elif opt in ('-w', '--width'):
388 try:
389 options.width = int(arg)
390 except ValueError:
391 usage(1, _('--width argument must be an integer: %s') % arg)
392 elif opt in ('-x', '--exclude-file'):
393 options.excludefilename = arg
395 # calculate escapes
396 make_escapes(options.escape)
398 # calculate all keywords
399 options.keywords.extend(default_keywords)
401 # initialize list of strings to exclude
402 if options.excludefilename:
403 try:
404 fp = open(options.excludefilename)
405 options.toexclude = fp.readlines()
406 fp.close()
407 except IOError:
408 sys.stderr.write(_("Can't read --exclude-file: %s") %
409 options.excludefilename)
410 sys.exit(1)
411 else:
412 options.toexclude = []
414 # slurp through all the files
415 eater = TokenEater(options)
416 for filename in args:
417 if filename == '-':
418 if options.verbose:
419 print _('Reading standard input')
420 fp = sys.stdin
421 closep = 0
422 else:
423 if options.verbose:
424 print _('Working on %s') % filename
425 fp = open(filename)
426 closep = 1
427 try:
428 eater.set_filename(filename)
429 tokenize.tokenize(fp.readline, eater)
430 finally:
431 if closep:
432 fp.close()
434 # write the output
435 if options.outfile == '-':
436 fp = sys.stdout
437 closep = 0
438 else:
439 if options.outpath:
440 options.outfile = os.path.join(options.outpath, options.outfile)
441 fp = open(options.outfile, 'w')
442 closep = 1
443 try:
444 eater.write(fp)
445 finally:
446 if closep:
447 fp.close()
449 \f
# Run the command line tool only when this file is executed as a script.
if __name__ == '__main__':
    main()
    # some more test strings
    # NOTE(review): presumably kept as extra marked input for running
    # pygettext over its own source; the return value is discarded -- confirm.
    _(u'a unicode string')