updateDocumentToC.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449
  1. #!/usr/bin/env python3
  2. #
  3. # updateDocumentToC.py
  4. #
  5. # Insert table of contents at top of Catch markdown documents.
  6. #
  7. # This script is distributed under the GNU General Public License v3.0
  8. #
  9. # It is based on markdown-toclify version 1.7.1 by Sebastian Raschka,
  10. # https://github.com/rasbt/markdown-toclify
  11. #
  12. from __future__ import print_function
  13. import argparse
  14. import glob
  15. import os
  16. import re
  17. import sys
  18. from scriptCommon import catchPath
# Configuration:
# Default for --min-toc-entries: documents with fewer collected headings
# get no table of contents.
minTocEntries = 4
# Heading levels left out of the ToC by default (only level 2 remains of 1-5).
headingExcludeDefault = [1,3,4,5]
# Heading levels left out of the ToC for release-notes.md.
# NOTE(review): this is identical to the default list, so level 2 headings are
# used here as well; an earlier comment said "level 1" - confirm the intent.
headingExcludeRelease = [1,3,4,5]
# Glob pattern of the documents processed when no files are given.
documentsDefault = os.path.join(os.path.relpath(catchPath), 'docs/*.md')
# Base name that identifies the release notes document.
releaseNotesName = 'release-notes.md'
# Text that the first line of a generated ToC starts with.
contentTitle = '**Contents**'
# 1-based line number at which the ToC is looked for and inserted,
# and the corresponding 0-based list index.
contentLineNo = 4
contentLineNdx = contentLineNo - 1
# End configuration
# Characters that dashifyHeadline keeps (lower-cased) when building an anchor.
VALIDS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-&'
  30. def readLines(in_file):
  31. """Returns a list of lines from a input markdown file."""
  32. with open(in_file, 'r') as inf:
  33. in_contents = inf.read().split('\n')
  34. return in_contents
  35. def removeLines(lines, remove=('[[back to top]', '<a class="mk-toclify"')):
  36. """Removes existing [back to top] links and <a id> tags."""
  37. if not remove:
  38. return lines[:]
  39. out = []
  40. for l in lines:
  41. if l.startswith(remove):
  42. continue
  43. out.append(l)
  44. return out
  45. def removeToC(lines):
  46. """Removes existing table of contents starting at index contentLineNdx."""
  47. if not lines[contentLineNdx ].startswith(contentTitle):
  48. return lines[:]
  49. result_top = lines[:contentLineNdx]
  50. pos = contentLineNdx + 1
  51. while lines[pos].startswith('['):
  52. pos = pos + 1
  53. result_bottom = lines[pos + 1:]
  54. return result_top + result_bottom
  55. def dashifyHeadline(line):
  56. """
  57. Takes a header line from a Markdown document and
  58. returns a tuple of the
  59. '#'-stripped version of the head line,
  60. a string version for <a id=''></a> anchor tags,
  61. and the level of the headline as integer.
  62. E.g.,
  63. >>> dashifyHeadline('### some header lvl3')
  64. ('Some header lvl3', 'some-header-lvl3', 3)
  65. """
  66. stripped_right = line.rstrip('#')
  67. stripped_both = stripped_right.lstrip('#')
  68. level = len(stripped_right) - len(stripped_both)
  69. stripped_wspace = stripped_both.strip()
  70. # GitHub's sluggification works in an interesting way
  71. # 1) '+', '/', '(', ')' and so on are just removed
  72. # 2) spaces are converted into '-' directly
  73. # 3) multiple -- are not collapsed
  74. dashified = ''
  75. for c in stripped_wspace:
  76. if c in VALIDS:
  77. dashified += c.lower()
  78. elif c.isspace():
  79. dashified += '-'
  80. else:
  81. # Unknown symbols are just removed
  82. continue
  83. return [stripped_wspace, dashified, level]
  84. def tagAndCollect(lines, id_tag=True, back_links=False, exclude_h=None):
  85. """
  86. Gets headlines from the markdown document and creates anchor tags.
  87. Keyword arguments:
  88. lines: a list of sublists where every sublist
  89. represents a line from a Markdown document.
  90. id_tag: if true, creates inserts a the <a id> tags (not req. by GitHub)
  91. back_links: if true, adds "back to top" links below each headline
  92. exclude_h: header levels to exclude. E.g., [2, 3]
  93. excludes level 2 and 3 headings.
  94. Returns a tuple of 2 lists:
  95. 1st list:
  96. A modified version of the input list where
  97. <a id="some-header"></a> anchor tags where inserted
  98. above the header lines (if github is False).
  99. 2nd list:
  100. A list of 3-value sublists, where the first value
  101. represents the heading, the second value the string
  102. that was inserted assigned to the IDs in the anchor tags,
  103. and the third value is an integer that represents the headline level.
  104. E.g.,
  105. [['some header lvl3', 'some-header-lvl3', 3], ...]
  106. """
  107. out_contents = []
  108. headlines = []
  109. for l in lines:
  110. saw_headline = False
  111. orig_len = len(l)
  112. l_stripped = l.lstrip()
  113. if l_stripped.startswith(('# ', '## ', '### ', '#### ', '##### ', '###### ')):
  114. # comply with new markdown standards
  115. # not a headline if '#' not followed by whitespace '##no-header':
  116. if not l.lstrip('#').startswith(' '):
  117. continue
  118. # not a headline if more than 6 '#':
  119. if len(l) - len(l.lstrip('#')) > 6:
  120. continue
  121. # headers can be indented by at most 3 spaces:
  122. if orig_len - len(l_stripped) > 3:
  123. continue
  124. # ignore empty headers
  125. if not set(l) - {'#', ' '}:
  126. continue
  127. saw_headline = True
  128. dashified = dashifyHeadline(l)
  129. if not exclude_h or not dashified[-1] in exclude_h:
  130. if id_tag:
  131. id_tag = '<a class="mk-toclify" id="%s"></a>'\
  132. % (dashified[1])
  133. out_contents.append(id_tag)
  134. headlines.append(dashified)
  135. out_contents.append(l)
  136. if back_links and saw_headline:
  137. out_contents.append('[[back to top](#table-of-contents)]')
  138. return out_contents, headlines
  139. def positioningHeadlines(headlines):
  140. """
  141. Strips unnecessary whitespaces/tabs if first header is not left-aligned
  142. """
  143. left_just = False
  144. for row in headlines:
  145. if row[-1] == 1:
  146. left_just = True
  147. break
  148. if not left_just:
  149. for row in headlines:
  150. row[-1] -= 1
  151. return headlines
  152. def createToc(headlines, hyperlink=True, top_link=False, no_toc_header=False):
  153. """
  154. Creates the table of contents from the headline list
  155. that was returned by the tagAndCollect function.
  156. Keyword Arguments:
  157. headlines: list of lists
  158. e.g., ['Some header lvl3', 'some-header-lvl3', 3]
  159. hyperlink: Creates hyperlinks in Markdown format if True,
  160. e.g., '- [Some header lvl1](#some-header-lvl1)'
  161. top_link: if True, add a id tag for linking the table
  162. of contents itself (for the back-to-top-links)
  163. no_toc_header: suppresses TOC header if True.
  164. Returns a list of headlines for a table of contents
  165. in Markdown format,
  166. e.g., [' - [Some header lvl3](#some-header-lvl3)', ...]
  167. """
  168. processed = []
  169. if not no_toc_header:
  170. if top_link:
  171. processed.append('<a class="mk-toclify" id="table-of-contents"></a>\n')
  172. processed.append(contentTitle + '<br>')
  173. for line in headlines:
  174. if hyperlink:
  175. item = '[%s](#%s)' % (line[0], line[1])
  176. else:
  177. item = '%s- %s' % ((line[2]-1)*' ', line[0])
  178. processed.append(item + '<br>')
  179. processed.append('\n')
  180. return processed
  181. def buildMarkdown(toc_headlines, body, spacer=0, placeholder=None):
  182. """
  183. Returns a string with the Markdown output contents incl.
  184. the table of contents.
  185. Keyword arguments:
  186. toc_headlines: lines for the table of contents
  187. as created by the createToc function.
  188. body: contents of the Markdown file including
  189. ID-anchor tags as returned by the
  190. tagAndCollect function.
  191. spacer: Adds vertical space after the table
  192. of contents. Height in pixels.
  193. placeholder: If a placeholder string is provided, the placeholder
  194. will be replaced by the TOC instead of inserting the TOC at
  195. the top of the document
  196. """
  197. if spacer:
  198. spacer_line = ['\n<div style="height:%spx;"></div>\n' % (spacer)]
  199. toc_markdown = "\n".join(toc_headlines + spacer_line)
  200. else:
  201. toc_markdown = "\n".join(toc_headlines)
  202. if placeholder:
  203. body_markdown = "\n".join(body)
  204. markdown = body_markdown.replace(placeholder, toc_markdown)
  205. else:
  206. body_markdown_p1 = "\n".join(body[:contentLineNdx ]) + '\n'
  207. body_markdown_p2 = "\n".join(body[ contentLineNdx:])
  208. markdown = body_markdown_p1 + toc_markdown + body_markdown_p2
  209. return markdown
  210. def outputMarkdown(markdown_cont, output_file):
  211. """
  212. Writes to an output file if `outfile` is a valid path.
  213. """
  214. if output_file:
  215. with open(output_file, 'w') as out:
  216. out.write(markdown_cont)
  217. def markdownToclify(
  218. input_file,
  219. output_file=None,
  220. min_toc_len=2,
  221. github=False,
  222. back_to_top=False,
  223. nolink=False,
  224. no_toc_header=False,
  225. spacer=0,
  226. placeholder=None,
  227. exclude_h=None):
  228. """ Function to add table of contents to markdown files.
  229. Parameters
  230. -----------
  231. input_file: str
  232. Path to the markdown input file.
  233. output_file: str (default: None)
  234. Path to the markdown output file.
  235. min_toc_len: int (default: 2)
  236. Miniumum number of entries to create a table of contents for.
  237. github: bool (default: False)
  238. Uses GitHub TOC syntax if True.
  239. back_to_top: bool (default: False)
  240. Inserts back-to-top links below headings if True.
  241. nolink: bool (default: False)
  242. Creates the table of contents without internal links if True.
  243. no_toc_header: bool (default: False)
  244. Suppresses the Table of Contents header if True
  245. spacer: int (default: 0)
  246. Inserts horizontal space (in pixels) after the table of contents.
  247. placeholder: str (default: None)
  248. Inserts the TOC at the placeholder string instead
  249. of inserting the TOC at the top of the document.
  250. exclude_h: list (default None)
  251. Excludes header levels, e.g., if [2, 3], ignores header
  252. levels 2 and 3 in the TOC.
  253. Returns
  254. -----------
  255. changed: Boolean
  256. True if the file has been updated, False otherwise.
  257. """
  258. cleaned_contents = removeLines(
  259. removeToC(readLines(input_file)),
  260. remove=('[[back to top]', '<a class="mk-toclify"'))
  261. processed_contents, raw_headlines = tagAndCollect(
  262. cleaned_contents,
  263. id_tag=not github,
  264. back_links=back_to_top,
  265. exclude_h=exclude_h)
  266. # add table of contents?
  267. if len(raw_headlines) < min_toc_len:
  268. processed_headlines = []
  269. else:
  270. leftjustified_headlines = positioningHeadlines(raw_headlines)
  271. processed_headlines = createToc(
  272. leftjustified_headlines,
  273. hyperlink=not nolink,
  274. top_link=not nolink and not github,
  275. no_toc_header=no_toc_header)
  276. if nolink:
  277. processed_contents = cleaned_contents
  278. cont = buildMarkdown(
  279. toc_headlines=processed_headlines,
  280. body=processed_contents,
  281. spacer=spacer,
  282. placeholder=placeholder)
  283. if output_file:
  284. outputMarkdown(cont, output_file)
  285. def isReleaseNotes(f):
  286. return os.path.basename(f) == releaseNotesName
  287. def excludeHeadingsFor(f):
  288. return headingExcludeRelease if isReleaseNotes(f) else headingExcludeDefault
  289. def updateSingleDocumentToC(input_file, min_toc_len, verbose=False):
  290. """Add or update table of contents in specified file. Return 1 if file changed, 0 otherwise."""
  291. if verbose :
  292. print( 'file: {}'.format(input_file))
  293. output_file = input_file + '.tmp'
  294. markdownToclify(
  295. input_file=input_file,
  296. output_file=output_file,
  297. min_toc_len=min_toc_len,
  298. github=True,
  299. back_to_top=False,
  300. nolink=False,
  301. no_toc_header=False,
  302. spacer=False,
  303. placeholder=False,
  304. exclude_h=excludeHeadingsFor(input_file))
  305. # prevent race-condition (Python 3.3):
  306. if sys.version_info >= (3, 3):
  307. os.replace(output_file, input_file)
  308. else:
  309. os.remove(input_file)
  310. os.rename(output_file, input_file)
  311. return 1
  312. def updateDocumentToC(paths, min_toc_len, verbose):
  313. """Add or update table of contents to specified paths. Return number of changed files"""
  314. n = 0
  315. for g in paths:
  316. for f in glob.glob(g):
  317. if os.path.isfile(f):
  318. n = n + updateSingleDocumentToC(input_file=f, min_toc_len=min_toc_len, verbose=verbose)
  319. return n
  320. def updateDocumentToCMain():
  321. """Add or update table of contents to specified paths."""
  322. parser = argparse.ArgumentParser(
  323. description='Add or update table of contents in markdown documents.',
  324. epilog="""""",
  325. formatter_class=argparse.RawTextHelpFormatter)
  326. parser.add_argument(
  327. 'Input',
  328. metavar='file',
  329. type=str,
  330. nargs=argparse.REMAINDER,
  331. help='files to process, at default: docs/*.md')
  332. parser.add_argument(
  333. '-v', '--verbose',
  334. action='store_true',
  335. help='report the name of the file being processed')
  336. parser.add_argument(
  337. '--min-toc-entries',
  338. dest='minTocEntries',
  339. default=minTocEntries,
  340. type=int,
  341. metavar='N',
  342. help='the minimum number of entries to create a table of contents for [{default}]'.format(default=minTocEntries))
  343. parser.add_argument(
  344. '--remove-toc',
  345. action='store_const',
  346. dest='minTocEntries',
  347. const=99,
  348. help='remove all tables of contents')
  349. args = parser.parse_args()
  350. paths = args.Input if args.Input else [documentsDefault]
  351. changedFiles = updateDocumentToC(paths=paths, min_toc_len=args.minTocEntries, verbose=args.verbose)
  352. if changedFiles > 0:
  353. print( "Processed table of contents in " + str(changedFiles) + " file(s)" )
  354. else:
  355. print( "No table of contents added or updated" )
# Run the command-line entry point only when this file is executed as a script.
if __name__ == '__main__':
    updateDocumentToCMain()
  358. # end of file