From b014dbb9927bc1f7c67028eb85354418b043bcfb Mon Sep 17 00:00:00 2001 From: Edoardo Baldi Date: Mon, 12 May 2025 22:46:26 +0200 Subject: [PATCH 1/4] Some quality-of-life enhancement to toc.py script Add proper error handling, logging, command-line options, and improved documentation. Clean anchor generation, add version, and enhance CLI help text with better descriptions and examples. --- tutorial/toc.py | 182 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 159 insertions(+), 23 deletions(-) diff --git a/tutorial/toc.py b/tutorial/toc.py index c1a382ed..444a36dd 100755 --- a/tutorial/toc.py +++ b/tutorial/toc.py @@ -1,39 +1,86 @@ #!/usr/bin/env python +# ruff: noqa G004 """CLI script to build a table of contents for an IPython notebook""" import argparse as ap +import logging import pathlib import re +import sys from collections import namedtuple import nbformat from nbformat import NotebookNode +__version__ = "0.1.1" + +# Set up logging +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger("toc") + TocEntry = namedtuple("TocEntry", ["level", "text", "anchor"]) def extract_markdown_cells(notebook: NotebookNode) -> str: - """Extract the markdown cells from a notebook""" + """Extract the markdown cells from a notebook + + Args: + notebook: A notebook object + + Returns: + str: Concatenated content of all markdown cells + """ return "\n".join( [cell.source for cell in notebook.cells if cell.cell_type == "markdown"] ) def extract_toc(notebook: str) -> list[TocEntry]: - """Extract the table of contents from a markdown string""" + """Extract the table of contents from a markdown string + + Parses markdown headings (lines starting with #) and converts them to TOC entries. + Each entry includes the heading level, text, and an anchor derived from the text. + + Args: + notebook: String containing markdown content + + Returns: + list[TocEntry]: List of table of contents entries + """ toc = [] line_re = re.compile(r"(#+)\s+(.+)") + line_num = 0 + for line in notebook.splitlines(): + line_num += 1 if groups := re.match(line_re, line): - heading, text, *_ = groups.groups() - level = len(heading) - anchor = "-".join(text.replace("`", "").split()) - toc.append(TocEntry(level, text, anchor)) + try: + heading, text, *_ = groups.groups() + level = len(heading) + + # Clean the text to make a proper anchor + clean_text = text.replace("`", "") + # Remove any other special characters that might break anchors + clean_text = re.sub(r"[^\w\s-]", "", clean_text) + anchor = "-".join(clean_text.lower().split()) + + toc.append(TocEntry(level, text, anchor)) + logger.debug(f"Found heading (level {level}): {text}") + except Exception as e: + logger.warning(f"Error processing heading at line {line_num}: {e}") + return toc def markdown_toc(toc: list[TocEntry]) -> str: - """Build a string representation of the toc as a nested markdown list""" + """Build a string representation of the toc as a nested markdown list + + Args: + toc: List of TocEntry objects + + Returns: + str: Markdown-formatted table of contents with proper indentation + """ lines = [] for entry in toc: line = f"{' ' * entry.level}- [{entry.text}](#{entry.anchor})" @@ -41,33 +88,69 @@ def markdown_toc(toc: list[TocEntry]) -> str: return "\n".join(lines) -def build_toc(nb_path: pathlib.Path, placeholder: str = "[TOC]") -> NotebookNode: - """Build a table of contents for a notebook and insert it at the location of a placeholder""" +def build_toc( + nb_path: pathlib.Path, + placeholder: str = "[TOC]", + toc_header: str = "# Table of Contents", +) -> tuple[NotebookNode, bool]: + """Build a table of contents for a notebook and insert it at the location of a placeholder + + Args: + nb_path: Path to the notebook file + placeholder: The text to replace with the generated TOC (default: "[TOC]") + toc_header: The header text to use for the TOC (default: "# Table of Contents") + + Returns: + tuple[NotebookNode, bool]: The notebook with TOC inserted and a boolean indicating if placeholder was found + """ # Read the notebook - nb_obj: NotebookNode = nbformat.read(nb_path, nbformat.NO_CONVERT) + try: + nb_obj: NotebookNode = nbformat.read(nb_path, nbformat.NO_CONVERT) + except Exception as e: + logger.error(f"Failed to read notebook '{nb_path}': {e}") + raise + md_cells = extract_markdown_cells(nb_obj) # Build tree toc_tree = extract_toc(md_cells) + if not toc_tree: + logger.warning(f"No headings found in notebook '{nb_path}'") + # Build toc representation toc_repr = markdown_toc(toc_tree) - # Insert it a the location of a placeholder - toc_header = "# Table of Contents" + # Insert it at the location of a placeholder + toc_replaced = False for cell in nb_obj.cells: if cell.source.startswith((placeholder, toc_header)): cell.source = f"{toc_header}\n{toc_repr}" cell.cell_type = "markdown" + toc_replaced = True + break + + if not toc_replaced: + logger.warning( + f"Placeholder '{placeholder}' or heading '{toc_header}' not found in notebook" + ) - return nb_obj + return nb_obj, toc_replaced def main(): """CLI entry point""" parser = ap.ArgumentParser( - description="Build a table of contents for an IPython notebook" + description="Build a table of contents for an IPython notebook", + epilog=""" + This script extracts headings from markdown cells in a Jupyter notebook and + generates a markdown-formatted table of contents. The TOC is inserted into + the notebook at the location of a placeholder (default: '[TOC]') or where + a '# Table of Contents' heading exists. Links in the TOC point to notebook + anchors created from the heading text. + """, + formatter_class=ap.RawDescriptionHelpFormatter, ) parser.add_argument("notebook", type=str, help="Path to the notebook to process") parser.add_argument( @@ -80,22 +163,75 @@ def main(): default=False, help="Force overwrite of original notebook", ) + parser.add_argument( + "--placeholder", + "-p", + type=str, + default="[TOC]", + help="Placeholder text to replace with the TOC (default: '[TOC]')", + ) + parser.add_argument( + "--header", + type=str, + default="# Table of Contents", + help="Header text for the TOC (default: '# Table of Contents')", + ) + parser.add_argument( + "--verbose", "-v", action="store_true", help="Enable verbose output" + ) + parser.add_argument( + "--version", action="version", version=f"%(prog)s {__version__}" + ) args = parser.parse_args() - if not (input_nb := pathlib.Path(args.notebook)).exists(): - raise FileNotFoundError(input_nb) - + # Set logging level based on verbosity + if args.verbose: + logger.setLevel(logging.DEBUG) + + # Validate input file + try: + input_nb = pathlib.Path(args.notebook) + if not input_nb.exists(): + logger.error(f"Input file not found: {input_nb}") + sys.exit(1) + if not input_nb.is_file(): + logger.error(f"Input path is not a file: {input_nb}") + sys.exit(1) + except Exception as e: + logger.error(f"Error processing input path: {e}") + sys.exit(1) + + # Set output file path if args.output is None: output_nb = input_nb.with_suffix(".toc.ipynb") else: output_nb = pathlib.Path(args.output) - with output_nb.open("w", encoding="utf-8") as file: - nbformat.write(build_toc(input_nb), file) - - if args.force: - input_nb.unlink() - output_nb.rename(input_nb) + # Create output directory if it doesn't exist + output_nb.parent.mkdir(parents=True, exist_ok=True) + + try: + # Generate TOC and write to output file + logger.info(f"Processing notebook: {input_nb}") + toc_notebook, toc_replaced = build_toc(input_nb, args.placeholder, args.header) + + if not toc_replaced: + logger.warning("Skipping output - no placeholder found in notebook") + sys.exit(0) # Exit with success code since it's not an error + + with output_nb.open("w", encoding="utf-8") as file: + nbformat.write(toc_notebook, file) + logger.info(f"TOC written to: {output_nb}") + + # Handle force option + if args.force: + logger.info(f"Replacing original notebook with TOC version") + input_nb.unlink() + output_nb.rename(input_nb) + logger.info(f"Original notebook replaced with: {input_nb}") + except Exception as e: + logger.error(f"Error processing notebook: {e}") + sys.exit(1) if __name__ == "__main__": From ab79959717bdfb007f6114b4c1f8fd97832fa5f6 Mon Sep 17 00:00:00 2001 From: Edoardo Baldi Date: Tue, 13 May 2025 07:31:23 +0200 Subject: [PATCH 2/4] [skip ci] Improve logging Replace f-strings with %-style formatting in logger calls for better performance. Use logger.exception() for better error reporting with tracebacks. --- tutorial/toc.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tutorial/toc.py b/tutorial/toc.py index 444a36dd..c020c567 100755 --- a/tutorial/toc.py +++ b/tutorial/toc.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# ruff: noqa G004 """CLI script to build a table of contents for an IPython notebook""" import argparse as ap @@ -12,7 +11,7 @@ import nbformat from nbformat import NotebookNode -__version__ = "0.1.1" +__version__ = "0.1.2" # Set up logging logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") @@ -65,9 +64,9 @@ def extract_toc(notebook: str) -> list[TocEntry]: anchor = "-".join(clean_text.lower().split()) toc.append(TocEntry(level, text, anchor)) - logger.debug(f"Found heading (level {level}): {text}") + logger.debug("Found heading (level %d): %s", level, text) except Exception as e: - logger.warning(f"Error processing heading at line {line_num}: {e}") + logger.warning("Error processing heading at line %d: %s", line_num, e) return toc @@ -106,8 +105,8 @@ def build_toc( # Read the notebook try: nb_obj: NotebookNode = nbformat.read(nb_path, nbformat.NO_CONVERT) - except Exception as e: - logger.error(f"Failed to read notebook '{nb_path}': {e}") + except Exception: + logger.exception("Failed to read notebook '%s'", nb_path) raise md_cells = extract_markdown_cells(nb_obj) @@ -116,7 +115,7 @@ def build_toc( toc_tree = extract_toc(md_cells) if not toc_tree: - logger.warning(f"No headings found in notebook '{nb_path}'") + logger.warning("No headings found in notebook '%s'", nb_path) # Build toc representation toc_repr = markdown_toc(toc_tree) @@ -133,7 +132,9 @@ def build_toc( if not toc_replaced: logger.warning( - f"Placeholder '{placeholder}' or heading '{toc_header}' not found in notebook" + "Placeholder '%s' or heading '%s' not found in notebook", + placeholder, + toc_header, ) return nb_obj, toc_replaced @@ -192,13 +193,13 @@ def main(): try: input_nb = pathlib.Path(args.notebook) if not input_nb.exists(): - logger.error(f"Input file not found: {input_nb}") + logger.error("Input file not found: %s", input_nb) sys.exit(1) if not input_nb.is_file(): - logger.error(f"Input path is not a file: {input_nb}") + logger.error("Input path is not a file: %s", input_nb) sys.exit(1) - except Exception as e: - logger.error(f"Error processing input path: {e}") + except Exception: + logger.exception("Error processing input path") sys.exit(1) # Set output file path @@ -212,7 +213,7 @@ def main(): try: # Generate TOC and write to output file - logger.info(f"Processing notebook: {input_nb}") + logger.info("Processing notebook: %s", input_nb) toc_notebook, toc_replaced = build_toc(input_nb, args.placeholder, args.header) if not toc_replaced: @@ -221,16 +222,16 @@ def main(): with output_nb.open("w", encoding="utf-8") as file: nbformat.write(toc_notebook, file) - logger.info(f"TOC written to: {output_nb}") + logger.info("TOC written to: %s", output_nb) # Handle force option if args.force: - logger.info(f"Replacing original notebook with TOC version") + logger.info("Replacing original notebook with TOC version") input_nb.unlink() output_nb.rename(input_nb) - logger.info(f"Original notebook replaced with: {input_nb}") - except Exception as e: - logger.error(f"Error processing notebook: {e}") + logger.info("Original notebook replaced with: %s", input_nb) + except Exception: + logger.exception("Error processing notebook") sys.exit(1) From a6a2c55f1384ef9e9aba88359b93d7dff670f9eb Mon Sep 17 00:00:00 2001 From: Edoardo Baldi Date: Tue, 13 May 2025 21:43:02 +0200 Subject: [PATCH 3/4] Fix TOC generation for headings inside code blocks Upgrade TocEntry from namedtuple to NamedTuple with type hints. Improve logic for detecting and ignoring markdown headers inside code blocks to prevent incorrect TOC entries. Also refactor the file output handling for better organization when using --force. --- tutorial/toc.py | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/tutorial/toc.py b/tutorial/toc.py index c020c567..ec5c682a 100755 --- a/tutorial/toc.py +++ b/tutorial/toc.py @@ -6,7 +6,7 @@ import pathlib import re import sys -from collections import namedtuple +from typing import NamedTuple import nbformat from nbformat import NotebookNode @@ -17,7 +17,13 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") logger = logging.getLogger("toc") -TocEntry = namedtuple("TocEntry", ["level", "text", "anchor"]) + +class TocEntry(NamedTuple): + """Table of contents entry""" + + level: int + text: str + anchor: str def extract_markdown_cells(notebook: NotebookNode) -> str: @@ -39,6 +45,7 @@ def extract_toc(notebook: str) -> list[TocEntry]: Parses markdown headings (lines starting with #) and converts them to TOC entries. Each entry includes the heading level, text, and an anchor derived from the text. + Ignores '#' symbols inside code blocks. Args: notebook: String containing markdown content @@ -49,9 +56,21 @@ def extract_toc(notebook: str) -> list[TocEntry]: toc = [] line_re = re.compile(r"(#+)\s+(.+)") line_num = 0 + is_code_block = False for line in notebook.splitlines(): line_num += 1 + + # Check if we're entering or exiting a code block + if line.strip().startswith("```"): + is_code_block = not is_code_block + continue + + # Skip header processing if we're in a code block + if is_code_block: + continue + + # Process headers only when not in a code block if groups := re.match(line_re, line): try: heading, text, *_ = groups.groups() @@ -220,16 +239,21 @@ def main(): logger.warning("Skipping output - no placeholder found in notebook") sys.exit(0) # Exit with success code since it's not an error - with output_nb.open("w", encoding="utf-8") as file: - nbformat.write(toc_notebook, file) - logger.info("TOC written to: %s", output_nb) + if not args.force: + logger.debug("Ignoring output file: %s", output_nb) - # Handle force option - if args.force: + with output_nb.open("w", encoding="utf-8") as file: + nbformat.write(toc_notebook, file) + + logger.info("TOC written to: %s", output_nb) + else: logger.info("Replacing original notebook with TOC version") - input_nb.unlink() - output_nb.rename(input_nb) + + with input_nb.open("w", encoding="utf-8") as file: + nbformat.write(toc_notebook, file) + logger.info("Original notebook replaced with: %s", input_nb) + except Exception: logger.exception("Error processing notebook") sys.exit(1) From 2024e0688067c2b428792008fa955e431cbfe5df Mon Sep 17 00:00:00 2001 From: Edoardo Baldi Date: Tue, 13 May 2025 21:49:37 +0200 Subject: [PATCH 4/4] Skip TOC header when generating table of contents --- tutorial/toc.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tutorial/toc.py b/tutorial/toc.py index ec5c682a..f95831e9 100755 --- a/tutorial/toc.py +++ b/tutorial/toc.py @@ -40,7 +40,7 @@ def extract_markdown_cells(notebook: NotebookNode) -> str: ) -def extract_toc(notebook: str) -> list[TocEntry]: +def extract_toc(notebook: str, toc_header: str) -> list[TocEntry]: """Extract the table of contents from a markdown string Parses markdown headings (lines starting with #) and converts them to TOC entries. @@ -49,17 +49,19 @@ def extract_toc(notebook: str) -> list[TocEntry]: Args: notebook: String containing markdown content + toc_header: Header text for the table of contents Returns: list[TocEntry]: List of table of contents entries """ toc = [] line_re = re.compile(r"(#+)\s+(.+)") - line_num = 0 is_code_block = False - for line in notebook.splitlines(): - line_num += 1 + for line_num, line in enumerate(notebook.splitlines(), start=1): + # Skip line if contains exactly the toc header + if line.strip() == toc_header: + continue # Check if we're entering or exiting a code block if line.strip().startswith("```"): @@ -70,7 +72,7 @@ def extract_toc(notebook: str) -> list[TocEntry]: if is_code_block: continue - # Process headers only when not in a code block + # Process headers if groups := re.match(line_re, line): try: heading, text, *_ = groups.groups() @@ -84,6 +86,7 @@ def extract_toc(notebook: str) -> list[TocEntry]: toc.append(TocEntry(level, text, anchor)) logger.debug("Found heading (level %d): %s", level, text) + except Exception as e: logger.warning("Error processing heading at line %d: %s", line_num, e) @@ -131,7 +134,7 @@ def build_toc( md_cells = extract_markdown_cells(nb_obj) # Build tree - toc_tree = extract_toc(md_cells) + toc_tree = extract_toc(md_cells, toc_header) if not toc_tree: logger.warning("No headings found in notebook '%s'", nb_path)