#!/usr/bin/env python3
"""Extract internationalized strings from source files into a PO template.

Scans the input files for ``_i("...")`` / ``_I("...")`` references and
``ISTR("...")`` definitions, collects the ``////`` metadata attached to
them, cross-references ``_T(MSG_x)`` / ``_O(MSG_x)`` / ``_R(MSG_x)`` catalog
lookups, prints diagnostics on stderr and writes the resulting catalog as a
PO template.
"""
import argparse
import bisect
import codecs
import polib
import regex
import sys
import lib.charset as cs


def line_warning(path, line, msg):
    """Print ``path:line: msg`` on stderr."""
    print(f'{path}:{line}: {msg}', file=sys.stderr)


def line_error(path, line, msg):
    """Print ``path:line: msg`` on stderr (same format as line_warning)."""
    print(f'{path}:{line}: {msg}', file=sys.stderr)


def entry_warning_locs(entries):
    """Print the text and the occurrence list of each catalog entry.

    ``entries`` is an iterable of ``(msgid, data)`` pairs where
    ``data['occurrences']`` is an iterable of ``(path, line)`` tuples.
    """
    for msgid, data in entries:
        print('   text: ' + repr(msgid), file=sys.stderr)
        positions = ', '.join(f'{loc[0]}:{loc[1]}' for loc in data['occurrences'])
        print('     in: ' + positions, file=sys.stderr)


def entries_warning(entries, msg):
    """Print a warning followed by the locations of all involved entries."""
    print('warning: ' + msg, file=sys.stderr)
    entry_warning_locs(entries)


def entry_warning(entry, msg):
    """Print a warning about a single ``(msgid, data)`` catalog entry."""
    entries_warning([entry], msg)


def newline_positions(source):
    """Return the index of every line terminator in ``source``.

    When the text does not end with a newline, the index of its last
    character is appended so that the final line is still delimited.
    """
    lines = [-1]
    while True:
        idx = source.find('\n', lines[-1] + 1)
        if idx < 0:
            break
        lines.append(idx)
    if lines[-1] != len(source) - 1:
        lines.append(len(source) - 1)
    # drop the -1 sentinel used to bootstrap the search
    return lines[1:]


def index_to_line(index, lines):
    """Map a character index to a 1-based line number.

    ``lines`` is the sorted list produced by newline_positions().
    """
    return bisect.bisect_left(lines, index) + 1


def extract_file(path, catalog, warn_skipped=False):
    """Collect the translatable strings of ``path`` into ``catalog``.

    ``catalog`` maps the decoded message text to a dict with the keys
    ``occurrences`` (set of ``(path, line)``), ``data`` (set of raw metadata
    strings), ``cat_name`` (set of ``MSG_*`` identifiers), ``comments`` (set
    of free-form comments) and ``ref_type`` (set containing ``'def'`` and/or
    ``'ref'``). Existing entries are merged, new ones are created.

    Empty strings and strings whose metadata contains ``IGNORE`` are skipped;
    the latter are reported only when ``warn_skipped`` is true.
    """
    # close the file deterministically instead of leaking the handle
    with open(path) as fd:
        source = fd.read()
    newlines = newline_positions(source)

    # match internationalized quoted strings
    RE_START = r'\b (_[iI]|ISTR) \s* \('
    RE_META = r'//// \s* ([^\n]*)$'

    RE_I = fr'''
        (?<!(?:/[/*]|^\s*\#) [^\n]*)  # not on a comment or preprocessor
        {RE_START}                    # $1 ref type _i( or ISTR(
        (?:
          \s*
          ("(?:[^"\\]|\\.)*")         # $2 quoted string (chunk)
          (?:\s* {RE_META} )?         # $3 inline metadata
        )+
        \s* \)                        # )
        (?:
          (?:[^\n] (?!{RE_START}))*   # anything except another entry
          {RE_META}                   # $5 final metadata
        )?
    '''

    # group numbers; $4 is the RE_START group inside the negative lookahead
    r_ref_type = 1
    r_quoted_chunk = 2
    r_inline_data = 3
    r_eol_data = 5

    for m in regex.finditer(RE_I, source, regex.M|regex.X):
        # parse the text
        line = index_to_line(m.start(0), newlines)

        text = ""
        for block in m.captures(r_quoted_chunk):
            # remove quotes and unescape
            block = block[1:-1]
            block = codecs.decode(block, 'unicode-escape', 'strict')
            block = cs.source_to_unicode(block)
            text += block

        # check if text is non-empty
        if len(text) == 0:
            line_warning(path, line, 'empty source text, ignored')
            continue

        # metadata has the form "<data> // <comment>"
        data = set()
        comments = set()
        for n in [r_inline_data, r_eol_data]:
            meta = m.group(n)
            if meta is not None:
                meta_parts = meta.split('//', 1)
                data.add(meta_parts[0].strip())
                if len(meta_parts) > 1:
                    comments.add(meta_parts[1].strip())

        # check if this message should be ignored
        ignored = any(regex.search(r'\bIGNORE\b', meta) is not None
                      for meta in data)
        if ignored:
            if warn_skipped:
                line_warning(path, line, 'skipping explicitly ignored translation')
            continue

        # extract message catalog name (if any)
        cat_name = set()
        for meta in data:
            sm = regex.search(r'\b(MSG_\w+)', meta)
            if sm is not None:
                cat_name.add(sm.group(1))

        # reference type annotation
        ref_type = 'def' if m.group(r_ref_type) == 'ISTR' else 'ref'
        if ref_type == 'def':
            # ISTR definition: lookup nearby assignment
            # The current line starts right after the previous newline; on
            # the first line there is none, and indexing newlines[-1] would
            # wrongly select the end of the file.
            line_start = newlines[line - 2] + 1 if line > 1 else 0
            lineup_def = source[line_start:m.end(r_ref_type)]
            sm = regex.search(r'\b PROGMEM_(\S+) \s*=\s* ISTR $', lineup_def, regex.M|regex.X)
            if sm is None:
                line_warning(path, line, 'ISTR not used in an assignment')
            elif sm.group(1) != 'I1':
                line_warning(path, line, 'ISTR not used with PROGMEM_I1')

        # append the translation to the catalog
        pos = [(path, line)]
        entry = catalog.get(text)
        if entry is None:
            catalog[text] = {'occurrences': set(pos),
                             'data': data,
                             'cat_name': cat_name,
                             'comments': comments,
                             'ref_type': set([ref_type])}
        else:
            entry['occurrences'] = entry['occurrences'].union(pos)
            entry['data'] = entry['data'].union(data)
            entry['cat_name'] = entry['cat_name'].union(cat_name)
            entry['comments'] = entry['comments'].union(comments)
            entry['ref_type'].add(ref_type)


def extract_refs(path, catalog):
    """Add the catalog references found in ``path`` to existing entries.

    Every ``_T(NAME)``, ``_O(NAME)`` or ``_R(NAME)`` outside of a comment or
    preprocessor line is recorded as an occurrence of each entry carrying
    that catalog name. An error is printed when the name is unknown or when
    none of the matching entries has a definition.
    """
    # close the file deterministically instead of leaking the handle
    with open(path) as fd:
        source = fd.read()
    newlines = newline_positions(source)

    # match message catalog references to add backrefs
    RE_CAT = r'''
        (?<!(?:/[/*]|^\s*\#) [^\n]*)          # not on a comment or preprocessor
        \b (?:_[TOR]) \s* \( \s* (\w+) \s* \) # $1 catalog name
    '''

    for m in regex.finditer(RE_CAT, source, regex.M|regex.X):
        line = index_to_line(m.start(0), newlines)
        pos = [(path, line)]
        cat_name = m.group(1)
        found = False
        defined = False
        for entry in catalog.values():
            if cat_name in entry['cat_name']:
                entry['occurrences'] = entry['occurrences'].union(pos)
                entry['ref_type'].add('ref')
                found = True
                if 'def' in entry['ref_type']:
                    defined = True
        if not found:
            line_error(path, line, f'{cat_name} not found')
        elif not defined:
            line_error(path, line, f'{cat_name} referenced but never defined')


def check_entries(catalog, warn_missing, warn_same_line):
    """Run consistency checks over the catalog, printing warnings on stderr.

    Checks performed: missing ``MSG_*`` identifier and definitions that are
    never referenced (the anonymous cases only when ``warn_missing``),
    characters rejected by ``cs.source_check``, malformed or repeated
    metadata annotations, identifiers shared by several texts and, when
    ``warn_same_line`` is true, several translations on one source line.
    """
    cat_entries = {}
    for entry in catalog.items():
        msgid, data = entry

        # ensure we have at least one name
        if len(data['cat_name']) == 0 and warn_missing:
            entry_warning(entry, 'missing MSG identifier')

        # ensure references are being defined
        if data['ref_type'] == set(['def']):
            if len(data['cat_name']) == 0:
                if warn_missing:
                    entry_warning(entry, 'entry defined, but never used')
            else:
                id_name = next(iter(data['cat_name']))
                entry_warning(entry, f'{id_name} defined, but never used')

        # check custom characters
        invalid_char = cs.source_check(msgid)
        if invalid_char is not None:
            entry_warning(entry, 'source contains unhandled custom character ' + repr(invalid_char))

        tokens = []
        for meta in data['data']:
            tokens.extend(regex.split(r'\s+', meta))
        seen_keys = set()
        for token in tokens:
            if len(token) == 0:
                continue

            # check metadata syntax
            if regex.match(r'[cr]=\d+$', token) is None and \
               regex.match(r'MSG_[A-Z_0-9]+$', token) is None:
                entry_warning(entry, 'bogus annotation: ' + repr(token))

            # check for repeated keys
            # The whole key must be captured: with the quantifier outside of
            # the group only its last character would be kept.
            key = regex.match(r'([^=]+)=', token)
            if key is not None:
                key_name = key.group(1)
                if key_name in seen_keys:
                    entry_warning(entry, 'repeated annotation: ' + repr(token))
                else:
                    seen_keys.add(key_name)

            # build the inverse catalog map
            if token.startswith('MSG_'):
                cat_entries.setdefault(token, []).append(entry)

    # ensure the same id is not used in multiple entries
    for cat_name, entries in cat_entries.items():
        if len(entries) > 1:
            entries_warning(entries, f'{cat_name} used in multiple translations')

    if warn_same_line:
        # build the inverse location map
        entry_locs = {}
        for entry in catalog.items():
            msgid, data = entry
            for loc in data['occurrences']:
                entry_locs.setdefault(loc, []).append(entry)

        # check for multiple translations on the same location
        for loc, entries in entry_locs.items():
            if len(entries) > 1:
                line_warning(loc[0], loc[1], 'line contains multiple translations')


def main():
    """Parse the command line, build the catalog and write the PO template.

    Returns 0, which is used as the process exit status.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('-o', dest='pot', required=True, help='PO template output file')
    ap.add_argument('--no-missing', action='store_true',
                    help='Do not warn about missing MSG entries')
    ap.add_argument('--warn-same-line', action='store_true',
                    help='Warn about multiple translations on the same line')
    ap.add_argument('--warn-skipped', action='store_true',
                    help='Warn about explicitly ignored translations')
    ap.add_argument('-s', '--sort', action='store_true',
                    help='Sort output catalog')
    ap.add_argument('file', nargs='+', help='Input files')
    args = ap.parse_args()

    # extract strings
    catalog = {}
    for path in args.file:
        extract_file(path, catalog, warn_skipped=args.warn_skipped)

    # process backreferences in a 2nd pass
    for path in args.file:
        extract_refs(path, catalog)

    # check the catalog entries
    check_entries(catalog, warn_missing=not args.no_missing, warn_same_line=args.warn_same_line)

    # write the output PO template
    po = polib.POFile()
    po.metadata = {
        'Language': 'en',
        'MIME-Version': '1.0',
        'Content-Type': 'text/plain; charset=utf-8',
        'Content-Transfer-Encoding': '8bit'}

    messages = catalog.keys()
    if args.sort:
        messages = sorted(messages)
    for msgid in messages:
        data = catalog[msgid]
        comment = ', '.join(data['data'])
        if len(data['comments']):
            comment += '\n' + '\n'.join(data['comments'])
        occurrences = data['occurrences']
        if args.sort:
            occurrences = list(sorted(occurrences))
        po.append(
            polib.POEntry(
                msgid=msgid,
                comment=comment,
                occurrences=occurrences))

    po.save(args.pot)
    return 0


if __name__ == '__main__':
    # sys.exit is always available, unlike the site-provided exit() builtin
    sys.exit(main())