lang-extract.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. #!/usr/bin/env python3
  2. import argparse
  3. import bisect
  4. import codecs
  5. import polib
  6. import regex
  7. import sys
  8. import lib.charset as cs
  9. def line_warning(path, line, msg):
  10. print(f'{path}:{line}: {msg}', file=sys.stderr)
  11. def line_error(path, line, msg):
  12. print(f'{path}:{line}: {msg}', file=sys.stderr)
  13. def entry_warning_locs(entries):
  14. for msgid, data in entries:
  15. print(' text: ' + repr(msgid), file=sys.stderr)
  16. positions = ', '.join(map(lambda x: x[0] + ':' + str(x[1]), data['occurrences']))
  17. print(' in: ' + positions, file=sys.stderr)
  18. def entries_warning(entries, msg):
  19. print('warning: ' + msg, file=sys.stderr)
  20. entry_warning_locs(entries)
  21. def entry_warning(entry, msg):
  22. entries_warning([entry], msg)
  23. def newline_positions(source):
  24. lines = [-1]
  25. while True:
  26. idx = source.find('\n', lines[-1] + 1)
  27. if idx < 0:
  28. break
  29. lines.append(idx)
  30. if lines[-1] != len(source) - 1:
  31. lines.append(len(source) - 1)
  32. return lines[1:]
  33. def index_to_line(index, lines):
  34. return bisect.bisect_left(lines, index) + 1
def extract_file(path, catalog, warn_skipped=False):
    """Scan one source file for internationalized strings and merge them
    into *catalog*.

    Recognizes ``_i("...")``, ``_I("...")`` and ``ISTR("...")`` quoted
    strings (with adjacent-chunk concatenation) plus their ``////``
    metadata annotations, either inline after a chunk or at end of line.

    catalog maps msgid -> dict with the set-valued keys 'occurrences'
    ((path, line) tuples), 'data' (annotation strings), 'cat_name'
    (MSG_* identifiers), 'comments' and 'ref_type' ('def'/'ref').
    When warn_skipped is true, entries annotated IGNORE are reported on
    stderr instead of being silently dropped.
    """
    source = open(path).read()
    newlines = newline_positions(source)
    # match internationalized quoted strings
    RE_START = r'\b (_[iI]|ISTR) \s* \('
    RE_META = r'//// \s* ([^\n]*)$'
    RE_I = fr'''
    (?<!(?:/[/*]|^\s*\#) [^\n]*) # not on a comment or preprocessor
    {RE_START} # $1 ref type _i( or ISTR(
    (?:
    \s*
    ("(?:[^"\\]|\\.)*") # $2 quoted string (chunk)
    (?:\s* {RE_META} )? # $3 inline metadata
    )+
    \s* \) # )
    (?:
    (?:[^\n] (?!{RE_START}))* # anything except another entry
    {RE_META} # $5 final metadata
    )?
    '''
    # Capture-group numbers within RE_I (RE_META contributes one group
    # at each of its two interpolation sites, hence the gap at 4).
    r_ref_type = 1
    r_quoted_chunk = 2
    r_inline_data = 3
    r_eol_data = 5
    for m in regex.finditer(RE_I, source, regex.M|regex.X):
        # parse the text
        line = index_to_line(m.start(0), newlines)
        text = ""
        # m.captures() (regex-module extension) yields every repeat of
        # the chunk group, so split string literals are concatenated.
        for block in m.captures(r_quoted_chunk):
            # remove quotes and unescape
            block = block[1:-1]
            block = codecs.decode(block, 'unicode-escape', 'strict')
            # project-specific charset mapping; presumably converts
            # firmware display codes to unicode — see lib/charset.py
            block = cs.source_to_unicode(block)
            text += block
        # check if text is non-empty
        if len(text) == 0:
            line_warning(path, line, 'empty source text, ignored')
            continue
        # split each metadata string into annotation ("c=20 r=1 MSG_X")
        # and free-form comment parts, separated by '//'
        data = set()
        comments = set()
        for n in [r_inline_data, r_eol_data]:
            meta = m.group(n)
            if meta is not None:
                meta_parts = meta.split('//', 1)
                data.add(meta_parts[0].strip())
                if len(meta_parts) > 1:
                    comments.add(meta_parts[1].strip())
        # check if this message should be ignored
        ignored = False
        for meta in data:
            if regex.search(r'\bIGNORE\b', meta) is not None:
                ignored = True
                break
        if ignored:
            if warn_skipped:
                line_warning(path, line, 'skipping explicitly ignored translation')
            continue
        # extra message catalog name (if any)
        cat_name = set()
        for meta in data:
            sm = regex.search(r'\b(MSG_\w+)', meta)
            if sm is not None:
                cat_name.add(sm.group(1))
        # reference type annotation
        ref_type = 'def' if m.group(r_ref_type) == 'ISTR' else 'ref'
        if ref_type == 'def':
            # ISTR definition: lookup nearby assignment
            # slice from the start of the previous line up to the end of
            # the ISTR token (for line 1, newlines[-1] wraps around to
            # the last offset — assumed not to matter in practice)
            lineup_def = source[newlines[line-2]+1:m.end(r_ref_type)]
            sm = regex.search(r'\b PROGMEM_(\S+) \s*=\s* ISTR $', lineup_def, regex.M|regex.X)
            if sm is None:
                line_warning(path, line, 'ISTR not used in an assignment')
            elif sm.group(1) != 'I1':
                line_warning(path, line, 'ISTR not used with PROGMEM_I1')
        # append the translation to the catalog
        pos = [(path, line)]
        entry = catalog.get(text)
        if entry is None:
            catalog[text] = {'occurrences': set(pos),
                             'data': data,
                             'cat_name': cat_name,
                             'comments': comments,
                             'ref_type': set([ref_type])}
        else:
            # already seen: merge all metadata sets into the entry
            entry['occurrences'] = entry['occurrences'].union(pos)
            entry['data'] = entry['data'].union(data)
            entry['cat_name'] = entry['cat_name'].union(cat_name)
            entry['comments'] = entry['comments'].union(comments)
            entry['ref_type'].add(ref_type)
  123. def extract_refs(path, catalog):
  124. source = open(path).read()
  125. newlines = newline_positions(source)
  126. # match message catalog references to add backrefs
  127. RE_CAT = r'''
  128. (?<!(?:/[/*]|^\s*\#) [^\n]*) # not on a comment or preprocessor
  129. \b (?:_[TO]) \s* \( \s* (\w+) \s* \) # $1 catalog name
  130. '''
  131. for m in regex.finditer(RE_CAT, source, regex.M|regex.X):
  132. line = index_to_line(m.start(0), newlines)
  133. pos = [(path, line)]
  134. cat_name = m.group(1)
  135. found = False
  136. defined = False
  137. for entry in catalog.values():
  138. if cat_name in entry['cat_name']:
  139. entry['occurrences'] = entry['occurrences'].union(pos)
  140. entry['ref_type'].add('ref')
  141. found = True
  142. if 'def' in entry['ref_type']:
  143. defined = True
  144. if not found:
  145. line_error(path, line, f'{cat_name} not found')
  146. elif not defined:
  147. line_error(path, line, f'{cat_name} referenced but never defined')
  148. def check_entries(catalog, warn_missing, warn_same_line):
  149. cat_entries = {}
  150. for entry in catalog.items():
  151. msgid, data = entry
  152. # ensure we have at least one name
  153. if len(data['cat_name']) == 0 and warn_missing:
  154. entry_warning(entry, 'missing MSG identifier')
  155. # ensure references are being defined
  156. if data['ref_type'] == set(['def']):
  157. if len(data['cat_name']) == 0:
  158. if warn_missing:
  159. entry_warning(entry, 'entry defined, but never used')
  160. else:
  161. id_name = next(iter(data['cat_name']))
  162. entry_warning(entry, f'{id_name} defined, but never used')
  163. # check custom characters
  164. invalid_char = cs.source_check(msgid)
  165. if invalid_char is not None:
  166. entry_warning(entry, 'source contains unhandled custom character ' + repr(invalid_char))
  167. tokens = []
  168. for meta in data['data']:
  169. tokens.extend(regex.split(r'\s+', meta))
  170. seen_keys = set()
  171. for token in tokens:
  172. if len(token) == 0:
  173. continue
  174. # check metadata syntax
  175. if regex.match(r'[cr]=\d+$', token) is None and \
  176. regex.match(r'MSG_[A-Z_0-9]+$', token) is None:
  177. entry_warning(entry, 'bogus annotation: ' + repr(token))
  178. # check for repeated keys
  179. key = regex.match(r'([^=])+=', token)
  180. if key is not None:
  181. key_name = key.group(1)
  182. if key_name in seen_keys:
  183. entry_warning(entry, 'repeated annotation: ' + repr(token))
  184. else:
  185. seen_keys.add(key_name)
  186. # build the inverse catalog map
  187. if token.startswith('MSG_'):
  188. if token not in cat_entries:
  189. cat_entries[token] = [entry]
  190. else:
  191. cat_entries[token].append(entry)
  192. # ensure the same id is not used in multiple entries
  193. for cat_name, entries in cat_entries.items():
  194. if len(entries) > 1:
  195. entries_warning(entries, f'{cat_name} used in multiple translations')
  196. if warn_same_line:
  197. # build the inverse location map
  198. entry_locs = {}
  199. for entry in catalog.items():
  200. msgid, data = entry
  201. for loc in data['occurrences']:
  202. if loc not in entry_locs:
  203. entry_locs[loc] = [loc]
  204. else:
  205. entry_locs[loc].append(loc)
  206. # check for multiple translations on the same location
  207. for loc, entries in entry_locs.items():
  208. if len(entries) > 1:
  209. line_warning(loc[0], loc[1], f'line contains multiple translations')
  210. def main():
  211. ap = argparse.ArgumentParser()
  212. ap.add_argument('-o', dest='pot', required=True, help='PO template output file')
  213. ap.add_argument('--no-missing', action='store_true',
  214. help='Do not warn about missing MSG entries')
  215. ap.add_argument('--warn-same-line', action='store_true',
  216. help='Warn about multiple translations on the same line')
  217. ap.add_argument('--warn-skipped', action='store_true',
  218. help='Warn about explicitly ignored translations')
  219. ap.add_argument('-s', '--sort', action='store_true',
  220. help='Sort output catalog')
  221. ap.add_argument('file', nargs='+', help='Input files')
  222. args = ap.parse_args()
  223. # extract strings
  224. catalog = {}
  225. for path in args.file:
  226. extract_file(path, catalog, warn_skipped=args.warn_skipped)
  227. # process backreferences in a 2nd pass
  228. for path in args.file:
  229. extract_refs(path, catalog)
  230. # check the catalog entries
  231. check_entries(catalog, warn_missing=not args.no_missing, warn_same_line=args.warn_same_line)
  232. # write the output PO template
  233. po = polib.POFile()
  234. po.metadata = {
  235. 'Language': 'en',
  236. 'MIME-Version': '1.0',
  237. 'Content-Type': 'text/plain; charset=utf-8',
  238. 'Content-Transfer-Encoding': '8bit'}
  239. messages = catalog.keys()
  240. if args.sort:
  241. messages = sorted(messages)
  242. for msgid in messages:
  243. data = catalog[msgid]
  244. comment = ', '.join(data['data'])
  245. if len(data['comments']):
  246. comment += '\n' + '\n'.join(data['comments'])
  247. occurrences = data['occurrences']
  248. if args.sort:
  249. occurrences = list(sorted(occurrences))
  250. po.append(
  251. polib.POEntry(
  252. msgid=msgid,
  253. comment=comment,
  254. occurrences=occurrences))
  255. po.save(args.pot)
  256. return 0
  257. if __name__ == '__main__':
  258. exit(main())