charset.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. # Mapping from LCD source encoding to unicode characters
  2. CUSTOM_CHARS = {
  3. '\x06': '⏬',
  4. '\x04': '🔃',
  5. '\xe4': 'µ',
  6. '\xdf': '°',
  7. '\xe1': 'ä',
  8. '\xe4': 'μ',
  9. '\xef': 'ö',
  10. '\xf5': 'ü',
  11. }
  12. # Characters to be remapped prior to source-encoding transformation
  13. # This transformation is applied to the translation prior to being converted to the final encoding,
  14. # and maps UTF8 to UTF8. It replaces symbols in the translation that are unavailable in the
  15. # source encoding with a close representation.
  16. # sources
  17. # https://en.wikipedia.org/wiki/Czech_orthography
  18. # https://en.wikipedia.org/wiki/German_orthography
  19. # https://en.wikipedia.org/wiki/French_orthography
  20. # https://en.wikipedia.org/wiki/Spanish_orthography
  21. # https://en.wikipedia.org/wiki/Italian_orthography
  22. # https://en.wikipedia.org/wiki/Polish_alphabet
  23. # https://en.wikipedia.org/wiki/Dutch_orthography
  24. # https://en.wikipedia.org/wiki/Romanian_alphabet
  25. # https://en.wikipedia.org/wiki/Hungarian_alphabet
  26. # https://en.wikipedia.org/wiki/Gaj%27s_Latin_alphabet
  27. # https://en.wikipedia.org/wiki/Slovak_orthography
  28. # https://en.wikipedia.org/wiki/Swedish_alphabet
  29. # https://en.wikipedia.org/wiki/Norwegian_orthography
  30. TRANS_CHARS = {
  31. 'á': 'a', #cz,fr,es,hu,sk
  32. 'Á': 'A', #cz,fr,hu,sk
  33. 'à': 'a', #fr,it
  34. 'À': 'A', #fr,it
  35. 'â': 'a', #fr,ro
  36. 'Â': 'A', #ro
  37. 'Ä': 'ä', #de,sv,no,sk
  38. 'å': 'a', #sv,no
  39. 'Å': 'A', #sv,no
  40. 'æ': 'ä', #sv,no
  41. 'ą': 'a', #pl
  42. 'Ą': 'A', #pl
  43. 'ă': 'a', #ro
  44. 'Ă': 'A', #ro
  45. 'ć': 'c', #pl,hr
  46. 'Ć': 'C', #pl,hr
  47. 'ç': 'c', #fr,nl
  48. 'č': 'c', #cz,hr,sk
  49. 'Č': 'C', #cz,hr,sk
  50. 'ď': 'd', #cz,sk
  51. 'Ď': 'D', #cz,sk
  52. 'đ': 'd', #hr
  53. 'Đ': 'D', #hr
  54. 'é': 'e', #cz,fr,es,it,nl,hu,sk
  55. 'É': 'E', #cz,fr,it,hu,sk
  56. 'è': 'e', #fr,it,nl
  57. 'È': 'E', #fr,it
  58. 'ê': 'e', #fr,nl
  59. 'ě': 'e', #cz
  60. 'ë': 'e', #fr
  61. 'Ě': 'E', #cz
  62. 'ę': 'e', #pl
  63. 'Ę': 'E', #pl
  64. 'í': 'i', #cz,es,it,sk
  65. 'Í': 'I', #cz,it,sk
  66. 'î': 'i', #fr,ro
  67. 'Î': 'I', #ro
  68. 'ĺ': 'l', #sk
  69. 'Ĺ': 'L', #sk
  70. 'ł': 'l', #pl
  71. 'Ł': 'L', #pl
  72. 'ľ': 'l', #sk
  73. 'Ľ': 'L', #sk
  74. 'ń': 'n', #pl
  75. 'Ń': 'N', #pl
  76. 'ň': 'n', #cz,sk
  77. 'Ň': 'N', #cz,sk
  78. 'ñ': 'n', #es,nl
  79. 'ó': 'o', #cz,es,pl,hu,sk
  80. 'Ó': 'O', #cz,pl,hu,sk
  81. 'ò': 'o', #it
  82. 'Ò': 'O', #it
  83. 'ô': 'o', #fr,nl,sk
  84. 'Ô': 'O', #sk
  85. 'œ': 'o', #fr
  86. 'ø': 'ö', #sv,no
  87. 'Ö': 'ö', #de,sv,no,hu
  88. 'ő': 'o', #hu
  89. 'Ő': 'O', #hu
  90. 'ŕ': 'r', #sk
  91. 'Ŕ': 'R', #sk
  92. 'ř': 'r', #cz
  93. 'Ř': 'R', #cz
  94. 'ś': 's', #pl
  95. 'Ś': 's', #pl
  96. 'š': 's', #cz,hr,sk
  97. 'Š': 'S', #cz,hr,sk
  98. 'ș': 's', #ro
  99. 'Ș': 'S', #ro
  100. 'ß': 'ss',#de
  101. 'ť': 't', #cz,sk
  102. 'Ť': 'T', #cz,sk
  103. 'ț': 't', #ro
  104. 'Ț': 'T', #ro
  105. 'ú': 'u', #cz,es,hu,sk
  106. 'Ú': 'U', #cz,hu,sk
  107. 'ù': 'u', #it
  108. 'Ù': 'U', #it
  109. 'û': 'u', #fr
  110. 'Ü': 'ü', #de,hu
  111. 'ů': 'u', #cz
  112. 'Ů': 'U', #cz
  113. 'ű': 'u', #hu
  114. 'Ű': 'U', #hu
  115. 'ý': 'y', #cz,sk
  116. 'Ý': 'Y', #cz,sk
  117. 'ÿ': 'y', #fr
  118. 'ź': 'z', #pl
  119. 'Ź': 'Z', #pl
  120. 'ž': 'z', #cz,hr,sk
  121. 'Ž': 'z', #cz,hr,sk
  122. 'ż': 'z', #pl
  123. 'Ż': 'Z', #pl
  124. '¿': '', #es
  125. '¡': '', #es
  126. }
  127. def _character_check(buf, valid_chars):
  128. for c in buf:
  129. if (not c.isascii() or not c.isprintable()) and c not in valid_chars:
  130. return c
  131. return None
  132. def source_check(buf):
  133. valid_chars = set(CUSTOM_CHARS.values())
  134. valid_chars.add('\n')
  135. return _character_check(buf, valid_chars)
  136. def translation_check(buf):
  137. valid_chars = set(CUSTOM_CHARS.keys())
  138. valid_chars.add('\n')
  139. return _character_check(buf, valid_chars)
  140. def source_to_unicode(buf):
  141. for src, dst in CUSTOM_CHARS.items():
  142. buf = buf.replace(src, dst)
  143. return buf
  144. def trans_replace(buf):
  145. for src, dst in TRANS_CHARS.items():
  146. buf = buf.replace(src, dst)
  147. return buf
  148. def unicode_to_source(buf):
  149. buf = trans_replace(buf)
  150. for dst, src in CUSTOM_CHARS.items():
  151. buf = buf.replace(src, dst)
  152. return buf