diff --git a/README.md b/README.md index bf1b9e7..a8f9c72 100644 --- a/README.md +++ b/README.md @@ -20,4 +20,4 @@ If everything looks fine: ``` npm run deploy ``` -This will deploy the changes into the `gh-pages` branch, and the website will be updated in a few (2-3) minutes. We can check the status in the `Actions` tab in GitHub. +This will deploy the changes into the `gh-pages` branch, and the website will be updated in a few (2-3) minutes. We can check the status in the [Actions](https://github.com/marctorsoc/marctorsoc.github.io/actions) tab in GitHub. diff --git a/package-lock.json b/package-lock.json index 3d00bb2..66afa3e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,13 +1,14 @@ { - "name": "marctorsoc.github.io", + "name": "my-website", "version": "0.1.0", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "marctorsoc.github.io", + "name": "my-website", "version": "0.1.0", "dependencies": { + "@heroicons/react": "^2.2.0", "front-matter": "^4.0.2", "gray-matter": "^4.0.3", "katex": "^0.16.19", @@ -782,6 +783,15 @@ "node": ">=18" } }, + "node_modules/@heroicons/react": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@heroicons/react/-/react-2.2.0.tgz", + "integrity": "sha512-LMcepvRaS9LYHJGsF0zzmgKCUim/X3N/DQKc4jepAXJ7l8QxJ1PmxJzqplF2Z3FE4PqBAIGyJAQ/w4B5dsqbtQ==", + "license": "MIT", + "peerDependencies": { + "react": ">= 16 || ^19.0.0-rc" + } + }, "node_modules/@isaacs/cliui": { "version": "8.0.2", "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", diff --git a/package.json b/package.json index 373cf20..bb00122 100644 --- a/package.json +++ b/package.json @@ -5,6 +5,7 @@ "homepage": "https://marctorsoc.github.io", "private": true, "dependencies": { + "@heroicons/react": "^2.2.0", "front-matter": "^4.0.2", "gray-matter": "^4.0.3", "katex": "^0.16.19", diff --git a/public/content/display-foldable.jpg b/public/content/display-foldable.jpg new file mode 100644 index 0000000..b5b56ec Binary files /dev/null and b/public/content/display-foldable.jpg differ diff --git a/public/content/notebook-inline-value.jpg b/public/content/notebook-inline-value.jpg new file mode 100644 index 0000000..aed2754 Binary files /dev/null and b/public/content/notebook-inline-value.jpg differ diff --git a/public/content/notebook-run-cells-section.jpg b/public/content/notebook-run-cells-section.jpg new file mode 100644 index 0000000..c3997e9 Binary files /dev/null and b/public/content/notebook-run-cells-section.jpg differ diff --git a/public/content/page-repr.jpg b/public/content/page-repr.jpg new file mode 100644 index 0000000..daf5217 Binary files /dev/null and b/public/content/page-repr.jpg differ diff --git a/public/content/pipes-example.jpg b/public/content/pipes-example.jpg new file mode 100644 index 0000000..ec436c6 Binary files /dev/null and b/public/content/pipes-example.jpg differ diff --git a/public/content/render-df-with-multi-line-texts.jpg b/public/content/render-df-with-multi-line-texts.jpg new file mode 100644 index 0000000..7dddd03 Binary files /dev/null and b/public/content/render-df-with-multi-line-texts.jpg differ diff --git a/public/content/render-nested-false.jpg b/public/content/render-nested-false.jpg new file mode 100644 index 0000000..553db38 Binary files /dev/null and b/public/content/render-nested-false.jpg differ diff --git a/public/content/render-nested-true.jpg b/public/content/render-nested-true.jpg new file mode 100644 index 0000000..4a31886 Binary files /dev/null and 
b/public/content/render-nested-true.jpg differ diff --git a/public/content/render-nested-vs-not-nested.jpg b/public/content/render-nested-vs-not-nested.jpg new file mode 100644 index 0000000..dff6ad6 Binary files /dev/null and b/public/content/render-nested-vs-not-nested.jpg differ diff --git a/public/content/side_by_side_example.jpg b/public/content/side_by_side_example.jpg new file mode 100644 index 0000000..58ff7bf Binary files /dev/null and b/public/content/side_by_side_example.jpg differ diff --git a/src/components/MarkdownComponents.tsx b/src/components/MarkdownComponents.tsx new file mode 100644 index 0000000..c9c8792 --- /dev/null +++ b/src/components/MarkdownComponents.tsx @@ -0,0 +1,208 @@ +import React from 'react'; +import { Link } from 'react-router-dom'; +import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter'; +import { vscDarkPlus } from 'react-syntax-highlighter/dist/esm/styles/prism'; +import Math from './Math'; +import type { Components } from 'react-markdown'; +import { generateId } from '../utils/textUtils'; + +interface ReactElementWithChildren { + props: { + children?: React.ReactNode; + }; +} + +function isReactElementWithChildren(obj: any): obj is ReactElementWithChildren { + return obj && typeof obj === 'object' && 'props' in obj && typeof obj.props === 'object'; +} + +function processMathInText(text: string) { + const parts = text.split(/(\$\$[^\$]+\$\$|\\begin{[\s\S]*?\\end{[\s\S]*?}|\$[^\$]+\$)/g); + + return parts.map((part, i) => { + if (part.startsWith('$$') || (part.startsWith('\\begin') && part.endsWith('\\end'))) { + // Block math + const mathContent = part.startsWith('$$') + ? part.slice(2, -2) + : part; + return {mathContent}; + } + if (part.startsWith('$') && part.endsWith('$')) { + // Inline math + const mathContent = part.slice(1, -1); + return {mathContent}; + } + // Plain text + return part; + }); +} + +function getTextContent(children: React.ReactNode): string { + if (children === null || children === undefined) { + return ''; + } + if (typeof children === 'string' || typeof children === 'number') { + return children.toString(); + } + if (Array.isArray(children)) { + return children.map(getTextContent).join(''); + } + if (isReactElementWithChildren(children)) { + return getTextContent(children.props.children || ''); + } + return ''; +} + +const MarkdownComponents: Components = { + code({ node, inline, className, children, ...props }: any) { + const match = /language-(\w+)/.exec(className || ''); + const content = String(children).trim(); + + // Handle math blocks + if (content.startsWith('$') && content.endsWith('$')) { + const mathContent = content.slice(1, -1); + return {mathContent}; + } + if (content.startsWith('$$') && content.endsWith('$$')) { + const mathContent = content.slice(2, -2); + return {mathContent}; + } + + const language = match ? match[1] : undefined; + + return !inline && language ? ( + + {content} + + ) : + // inline blocks + ( + + {children} + + ); + }, + p(props) { + const { children, ...rest } = props; + + return ( +
<p {...rest}>
+ {React.Children.map(children, child => { + if (typeof child === 'string') { + // Process text for inline and block math + const parts = processMathInText(child); + + // Return all parts together (text and inline math) + return parts; + } + return child; + })} +
</p>
+ ); + }, + a(props) { + const { href, children, ...rest } = props; + if (href && href.startsWith('/')) { + return ( + + {children} + + ); + } + return {children}; + }, + + strong(props) { + const { children, ...rest } = props; + return ( + + {React.Children.map(children, child => { + if (typeof child === 'string') { + return processMathInText(child); + } + return child; + })} + + ); + }, + em(props) { + const { children, ...rest } = props; + return ( + + {React.Children.map(children, child => { + if (typeof child === 'string') { + return processMathInText(child); + } + return child; + })} + + ); + }, + i(props) { + const { children, ...rest } = props; + return ( + + {React.Children.map(children, child => { + if (typeof child === 'string') { + return processMathInText(child); + } + return child; + })} + + ); + }, + img(props) { + return ( + + + ); + }, + + h1: ({children}) => { + const text = getTextContent(children); + const id = generateId(text); + return
<h1 id={id}>
{children}
</h1>
; + }, + h2: ({children}) => { + const text = getTextContent(children); + const id = generateId(text); + return
<h2 id={id}>
{children}
</h2>
; + }, + h3: ({children}) => { + const text = getTextContent(children); + const id = generateId(text); + return
<h3 id={id}>
{children}
</h3>
; + }, + h4: ({children}) => { + const text = getTextContent(children); + const id = generateId(text); + return
<h4 id={id}>
{children}
</h4>
; + }, + h5: ({children}) => { + const text = getTextContent(children); + const id = generateId(text); + return
<h5 id={id}>{children}</h5>
; + }, + h6: ({children}) => { + const text = getTextContent(children); + const id = generateId(text); + return
<h6 id={id}>{children}</h6>
; + }, +}; + +export default MarkdownComponents; diff --git a/src/components/Math.tsx b/src/components/Math.tsx new file mode 100644 index 0000000..a237c2b --- /dev/null +++ b/src/components/Math.tsx @@ -0,0 +1,50 @@ +import katex from 'katex'; +import 'katex/dist/katex.min.css'; + +function Math({ children, inline = false }: { children: string, inline?: boolean }) { + let html = ''; + try { + html = katex.renderToString(children, { + displayMode: !inline, + throwOnError: false, + trust: true, + strict: false, + fleqn: false, + output: 'html', + maxSize: 10, + maxExpand: 1000, + globalGroup: true, + macros: { + "\\RR": "\\mathbb{R}", + "\\NN": "\\mathbb{N}", + "\\ZZ": "\\mathbb{Z}" + } + }); + } catch (error) { + console.error('KaTeX render error:', error, 'Input LaTeX:', children); + html = `${children}`; + } + + return inline ? ( + + ) : ( + + ); +} + +export default Math; diff --git a/src/components/Post.tsx b/src/components/Post.tsx new file mode 100644 index 0000000..e69de29 diff --git a/src/components/PostCard.tsx b/src/components/PostCard.tsx index eb88ac4..484ac2e 100644 --- a/src/components/PostCard.tsx +++ b/src/components/PostCard.tsx @@ -1,63 +1,11 @@ -import React from 'react'; import { Post } from '../types/Post'; import ReactMarkdown from 'react-markdown'; import { Link } from 'react-router-dom'; import remarkGfm from 'remark-gfm'; -import katex from 'katex'; import rehypeRaw from 'rehype-raw'; -import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter'; -import { vscDarkPlus } from 'react-syntax-highlighter/dist/esm/styles/prism'; -import 'katex/dist/katex.min.css'; -import type { Components } from 'react-markdown'; +import MarkdownComponents from './MarkdownComponents'; import { categoryColors } from '../utils/constants'; - -function Math({ children, inline = false }: { children: string, inline?: boolean }) { - let html = ''; - try { - html = katex.renderToString(children, { - displayMode: !inline, - throwOnError: false, - trust: true, - strict: false, - fleqn: false, - output: 'html', - maxSize: 10, - maxExpand: 1000, - globalGroup: true, - macros: { - "\\RR": "\\mathbb{R}", - "\\NN": "\\mathbb{N}", - "\\ZZ": "\\mathbb{Z}" - } - }); - } catch (error) { - console.error('KaTeX render error:', error, 'Input LaTeX:', children); - html = `${children}`; - } - - return inline ? ( - - ) : ( - - ); -} - - +import { TableOfContents } from './TableOfContents'; function preprocessContent(content: string): string { // First, normalize line breaks in align environments @@ -90,7 +38,7 @@ function preprocessContent(content: string): string { const cleanBody = body .trim() .replace(/\\\s*\n/g, '\n') - .replace(/(?:\\!){2,}/g, '\\!') + .replace(/(?:\\!){2,}/g, '\\!') // Add missing quote here .replace(/(?:\\,){2,}/g, '\\,'); return `${start}${cleanBody}${end}`; // Keep \begin{equation}...\end{equation} } @@ -99,141 +47,10 @@ function preprocessContent(content: string): string { return processed; } -function processMathInText(text: string) { - const parts = text.split(/(\$\$[^\$]+\$\$|\\begin{[\s\S]*?\\end{[\s\S]*?}|\$[^\$]+\$)/g); - - return parts.map((part, i) => { - if (part.startsWith('$$') || (part.startsWith('\\begin') && part.endsWith('\\end'))) { - // Block math - const mathContent = part.startsWith('$$') - ? 
part.slice(2, -2) - : part; - return {mathContent}; - } - if (part.startsWith('$') && part.endsWith('$')) { - // Inline math - const mathContent = part.slice(1, -1); - return {mathContent}; - } - // Plain text - return part; - }); -} - - - -const MarkdownComponents: Components = { - code({ node, inline, className, children, ...props }: any) { - const match = /language-(\w+)/.exec(className || ''); - const content = String(children).trim(); - - // Handle math blocks - if (content.startsWith('$') && content.endsWith('$')) { - const mathContent = content.slice(1, -1); - return {mathContent}; - } - if (content.startsWith('$$') && content.endsWith('$$')) { - const mathContent = content.slice(2, -2); - return {mathContent}; - } - - const language = match ? match[1] : undefined; - - return !inline && language ? ( - - {content} - - ) : ( - - {children} - - ); - }, - p(props) { - const { children, ...rest } = props; - - return ( -

- {React.Children.map(children, child => { - if (typeof child === 'string') { - // Process text for inline and block math - const parts = processMathInText(child); - - // Return all parts together (text and inline math) - return parts; - } - return child; - })} -

- ); - }, - a(props) { - const { href, children, ...rest } = props; - if (href && href.startsWith('/')) { - return ( - - {children} - - ); - } - return {children}; - }, - - strong(props) { - const { children, ...rest } = props; - return ( - - {React.Children.map(children, child => { - if (typeof child === 'string') { - return processMathInText(child); - } - return child; - })} - - ); - }, - em(props) { - const { children, ...rest } = props; - return ( - - {React.Children.map(children, child => { - if (typeof child === 'string') { - return processMathInText(child); - } - return child; - })} - - ); - }, - i(props) { - const { children, ...rest } = props; - return ( - - {React.Children.map(children, child => { - if (typeof child === 'string') { - return processMathInText(child); - } - return child; - })} - - ); - }, - img(props) { - return ( - - - ); - } -}; - interface PostCardProps { post: Post; isPinned?: boolean; + isArchived?: boolean; showFullContent?: boolean; maxPreviewChars?: number; compact?: boolean; @@ -300,6 +117,17 @@ export function PostCard({ post, showFullContent = false, compact = false }: Pos ))} + {showFullContent && post.toc && post.toc.length > 0 && ( + <> + +
+ + )} + + {post.heroImage && showFullContent && (
- )} + )}
diff --git a/src/components/TableOfContents.tsx b/src/components/TableOfContents.tsx new file mode 100644 index 0000000..27fb50d --- /dev/null +++ b/src/components/TableOfContents.tsx @@ -0,0 +1,166 @@ +import React, { useState } from 'react'; +import { Header } from '../utils/tocUtils'; +import ReactMarkdown from 'react-markdown'; +import remarkGfm from 'remark-gfm'; +import { ChevronDownIcon, ChevronRightIcon } from '@heroicons/react/24/outline'; + +interface TableOfContentsProps { + headers: Header[]; + className?: string; + indentSize?: number; // New prop for configurable indentation +} + +interface NumberedHeader extends Header { + number: string; + children: NumberedHeader[]; // Remove optional marker, always initialize as array +} + +function generateNumberedHeaders(headers: Header[]): NumberedHeader[] { + const minLevel = Math.min(...headers.map(h => h.level)); + const result: NumberedHeader[] = []; + const stack: NumberedHeader[] = []; + const counters = new Map(); + + headers.forEach(header => { + const relativeLevel = header.level - minLevel; + + // Initialize or increment counter for this level + counters.set(relativeLevel, (counters.get(relativeLevel) || 0) + 1); + + // Reset counters for deeper levels + for (const [level, _] of counters) { + if (level > relativeLevel) { + counters.delete(level); + } + } + + // Generate number based on all parent levels + const number = Array.from({ length: relativeLevel + 1 }, (_, i) => + counters.get(i) || 1 + ).join('.'); + + const numbered: NumberedHeader = { + ...header, + number, + children: [] // Always initialize as empty array + }; + + // Pop stack until we find the parent or reach the root + while (stack.length > 0 && stack[stack.length - 1].level >= header.level) { + stack.pop(); + } + + if (stack.length > 0 && stack[stack.length - 1].level < header.level) { + // Add as child to the last item in stack + stack[stack.length - 1].children.push(numbered); + } else { + // Add to root level + result.push(numbered); + } + + stack.push(numbered); + }); + + return result; +} + +const TOCItem: React.FC<{ + header: NumberedHeader; + minLevel: number; + indentSize?: number; +}> = ({ header, minLevel, indentSize = 0 }) => { + const [isExpanded, setIsExpanded] = useState(true); + const hasChildren = header.children.length > 0; // Remove optional chaining + const isTopLevel = header.level === minLevel; + + return ( +
  • + + {hasChildren && isExpanded && ( +
      + {header.children.map(child => ( + + ))} +
    + )} +
  • + ); +}; + +export const TableOfContents: React.FC = ({ + headers, + className = '', + indentSize = 0.5 // Adjusted default indent size +}) => { + if (!headers || headers.length === 0) return null; + + const numberedHeaders = generateNumberedHeaders(headers); + const minLevel = Math.min(...headers.map(h => h.level)); + + return ( + + ); +}; + +export default TableOfContents; diff --git a/src/components/layout/Navigation.tsx b/src/components/layout/Navigation.tsx index c842fae..8c92233 100644 --- a/src/components/layout/Navigation.tsx +++ b/src/components/layout/Navigation.tsx @@ -1,10 +1,16 @@ -import { useContext, useState } from 'react'; -import { NavLink } from 'react-router-dom'; +import { useContext, useEffect, useState } from 'react'; +import { NavLink, useLocation } from 'react-router-dom'; import { DarkModeContext } from '../../utils/darkMode'; export default function Navigation() { const { isDarkMode, toggleDarkMode } = useContext(DarkModeContext); const [isMenuOpen, setIsMenuOpen] = useState(false); + const location = useLocation(); + + // Scroll to top on route change + useEffect(() => { + window.scrollTo(0, 0); + }, [location.pathname]); const toggleMenu = () => setIsMenuOpen(!isMenuOpen); const closeMenu = () => setIsMenuOpen(false); diff --git a/src/content/posts/2017-11-06-python-tips-tricks.md b/src/content/posts/2017-11-06-python-tips-tricks.md index 9dbba6a..afdfb93 100644 --- a/src/content/posts/2017-11-06-python-tips-tricks.md +++ b/src/content/posts/2017-11-06-python-tips-tricks.md @@ -3,19 +3,16 @@ title: Python Tips & Tricks date: 2017-11-06T00:00:00+00:00 permalink: /posts/python-tips-tricks categories: - - Coding + - Dev tools isPinned: false +isArchived: true --- -

    Today I'm going to explain some of the lessons and tools I've found that could be useful for someone developing data science (and also general purpose projects) in Python:

    +Today I'm going to explain some of the lessons and tools I've found that could be useful for someone developing data science (and also general purpose projects) in Python. - * Ipython - * Notebooks - * Tricks for python in a terminal +For the readers already working in Python, probably some of these items are already known. But I love shortcuts and ways to save time, so maybe you can still find something useful in what follows. -

    For the readers already working in Python, probably some of these items are already known. But I love shortcuts and ways to save time, so maybe you can still find something useful in what follows.

    - -**IPython** +## IPython

An improved version of the typical Python shell. The "i" actually comes from "interactive", but I would rather say it stands for "improved".
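If you have never tried it, here is a ten-second taste of what it buys you (a sketch, assuming ipython is installed):

```python
In [1]: import math

In [2]: math.sqrt?            # quick help on any object, no docs lookup needed

In [3]: %timeit math.sqrt(2)  # magics like %timeit make micro-benchmarks trivial
```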

    @@ -31,7 +28,7 @@ In case you are in Ubuntu, you can also run sudo apt install python-ipython

    -

    Hey, it's not a mistake. , since last versions, apt-get has been simplified to just apt. The advantages of ipython w.r.t. classical python shell are many, let's enumerate some:

+Hey, it's not a mistake. In recent versions, `apt-get` has been simplified to just `apt`. The advantages of `ipython` over the classical Python shell are many; let's enumerate some: *

As in a Unix terminal, you can autocomplete and see suggestions using the Tab key. Something really appreciated when typing filenames, variable names, etc.

    *

Using the Alt key, ipython lets you move the cursor word by word. This feature is not available (at least on my Mac) when using the classical Python shell

    @@ -43,7 +40,7 @@ In case you are in Ubuntu, you can also run

    -Profiles +### Profiles

Have you ever been programming in your favorite IDE and thought: "Will numpy/pandas accept this? I'm gonna try a toy example in a terminal, just to be sure". Then you go to a terminal, write [i]python and you have to write, for the 334th time in a week, the famous:
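Presumably something like the classic:

```python
import numpy as np
import pandas as pd
```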

    @@ -74,7 +71,7 @@ In case you are in Ubuntu, you can also run 1. Modify the default profile, found in the same directory as the profiles we create 2. Create an alias, see last section in this post -**Notebooks** +## Notebooks

Jupyter notebooks are really powerful environments where you can develop applications not only in Python, but also in other programming languages such as R. They are a complete world, and I'm not gonna explain the entire list of features they have. See an example here. As you can see, they might be useful to present work to other people, but also to have a more dynamic environment where you can run just some pieces of the code, standing as an intermediate player between the terminal and running code in an IDE.

    @@ -109,7 +106,7 @@ and you can run it by

Another feature I like is notebook themes. Some of us don't like to code in a black-on-white scheme (white background, black fonts), though there is a lot of controversy about this. To be honest, before writing this post I always thought that it was healthier for my eyes, but it turns out that it depends on the ambient light, and also on everyone's eyesight. In any case, if you feel better, or at least the same, with dark themes, you can do your bit by saving battery and energy, which is good for both your pocket and your planet. Instructions can be found at the link.
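If you want to give dark themes a try, one option is the `jupyterthemes` package (shown here as an example; it may or may not be the exact tool linked above):

```bash
pip install jupyterthemes
jt -l           # list the available themes
jt -t monokai   # switch to a dark theme
jt -r           # revert to the default look
```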

    - **Python in a terminal** + ## Python in a terminal

I highly recommend working in an Integrated Development Environment (IDE) to develop code and using a Version Control System (VCS). My favorites are PyCharm and Git, respectively. They are free, popular, and enough for almost any task. However, in some situations we prefer/have to work in a Python shell. Here I give you some tips and tricks to improve your experience in that situation.
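As a teaser of the alias section below, this is the kind of shortcut I mean (the names are just my choice):

```bash
# add to ~/.bashrc (or ~/.bash_profile on Mac)
alias ipy="ipython"          # three letters instead of seven
alias jn="jupyter notebook"  # start a notebook server from anywhere
```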

    @@ -144,7 +141,7 @@ These alias just create shortcuts to save time, e.g: *

control+z: sends the current process to sleep (background). This can be useful if you want to try something in the terminal without losing your workspace in Python, or if you want to work with two different Python environments at the same time, since typing python will start a new and completely independent environment. To return to the last slept process, run fg (foreground). A list of the current processes in a terminal can be obtained by running jobs (see the short session sketched after this list). More info about this here.

    *

I love using the *home* and *end* keys, but on my Mac I don't have them, and the first days I was really disappointed. In some applications, such as the browser, you can move the cursor to the beginning and end of the line by pressing cmd+left/right arrow, but it doesn't work in the terminal. In such a case, the default shortcuts are control+a and control+e. This works for the ipython shell as well.
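And here is the control+z / fg dance mentioned above, sketched as a typical session (`expensive_computation` is a made-up placeholder):

```bash
$ python
>>> results = expensive_computation()   # hypothetical long-running workspace
# press control+z here
[1]+  Stopped                 python
$ jobs    # list the processes of this terminal
[1]+  Stopped                 python
$ fg      # back to python, with `results` still in memory
```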

    - **Conclusion** + ## Conclusion

Today we have reviewed some tips and tricks for working with Python in a more agile way. There are many, many other things that I could recommend, but they'll probably be the subject of future posts. Mainly, we have reviewed the usefulness of ipython as a better interface to run Python commands than the classical shell; notebooks as an innovative way of working and presenting work with Python; and finally some tips and tricks for using Python and related tools in a terminal.

    diff --git a/src/content/posts/2017-12-20-christmas-routes.md b/src/content/posts/2017-12-20-christmas-routes.md index e21d63b..e371d3f 100644 --- a/src/content/posts/2017-12-20-christmas-routes.md +++ b/src/content/posts/2017-12-20-christmas-routes.md @@ -3,7 +3,7 @@ title: Christmas routes date: 2017-12-20T22:43:43+00:00 permalink: /posts/christmas-routes/ categories: - - AI + - Coding puzzles heroImage: /content/santa.jpg heroImageWidth: 80% heroImageHeight: 50% diff --git a/src/content/posts/2018-04-08-dc-dungeon-code.md b/src/content/posts/2018-04-08-dc-dungeon-code.md index 608223b..0129a3a 100644 --- a/src/content/posts/2018-04-08-dc-dungeon-code.md +++ b/src/content/posts/2018-04-08-dc-dungeon-code.md @@ -3,8 +3,7 @@ title: "DC: Dungeon & Code" date: '2018-04-08 22:43:44 +0100' permalink: /posts/dc-dungeon-code/ categories: -- Coding -- AI +- Coding puzzles heroImage: /content/yX1T7D.png heroImageWidth: 70% --- diff --git a/src/content/posts/2019-02-08-data-science-in-remote.md b/src/content/posts/2019-02-08-data-science-in-remote.md deleted file mode 100644 index 6e82da8..0000000 --- a/src/content/posts/2019-02-08-data-science-in-remote.md +++ /dev/null @@ -1,175 +0,0 @@ ---- -title: Data Science in remote -date: '2019-02-08 00:00:00 +0000' -permalink: /posts/data-science-in-remote/ -categories: -- AI -- Coding -- Work -isPinned: true ---- - -Today we'll see some tips and tricks when working (but not only) on -remote. This is typical in a data scientist life as layman laptops, at some point, -and especially if you work with big data, entail limitations in memory and/or -speed. - -In most companies or in academia, when you need -to run a huge task you are given a remote machine, either in AWS or in a company/uni -cluster. This means that you won't have physical access, and you can only connect -there remotely. - - **Disclaimer**: all info written in this post assumes a Mac as your local, and Ubuntu as remote. Most of it should work also with other combinations of these two. - -## Access to a remote machine - -Accessing to the cluster usually is as easy as writing: - - ```ssh username@ip``` - -This will require you to write your password, but there is a solution to avoid writing your password all the time (all from your local, source): - -1. `ssh-keygen -t rsa` -1. `ssh username@ip mkdir -p .ssh` -1. `cat .ssh/id_rsa.pub | ssh username@ip 'cat >> .ssh/authorized_keys'` - -In addition to this, I like to define my alias to access to it, so adding to the `.bashrc` file something like: - -` -alias sshmarc=ssh username@ip -` - -so that just typing something short I can access to it. - -## File transfer - -Always do this from your local machine. Examples: - -From cluster to local (the dot means that it will be copied to the current directory): - -`scp marc@172.16.6.35:~/titanic.log .` - -From local to cluster: - -`scp -r ~/experiments/results marc@172.16.6.35:~/results/` - -A way to just synchronise is using rsync: - -` -rsync -arv --ignore-existing --progress marc@172.16.6.32:/home/marc/results experiments/results/` - -## Run shells on background - -The main application of having a server is not only to run experiments when you're connected, but to leave it the whole night, so that you don't have to wait. However, if you close the ssh connection, you'll lose everything you had. A workaround for that is running the processes on background, or using a terminal multiplexer. While I started with `screen`, I've finally adopted `tmux` as my favorite one. 
A short cheatsheet next: - -* `tmux ls` -* `tmux new -s name` -* `tmux attach -t name` -* control + b, and then d => detach -* `killall tmux` - -## Notebooks - -As explained in -this post, Jupyter notebooks are very powerful tools for Python easy prototyping, but also for intensive development. However, one typically runs the jupyter server in local, and connect via browser. How do we do this when we want the Python to run in our remote box? - -1. Install Jupyter lab in the remote box to be able to run the notebook server: `conda install jupyterlab` - -1. `jupyter lab --no-browser --port=8089.` - -1. Now in your local terminal run: `ssh -N -L 8000:localhost:8089 marc@172.16.6.32`. - -There will be no answer, just leave this open. -This creates a tunnel from your port 8000 to the port 8089 in the server (these ports are examples and can be changed to any number), where the jupyter server is listening. Note that if you -have multiple servers, they can all listen in the same port, but you have to tunnel them to different ports, so changing the 8000! - -Open a browser and go to `localhost:8000`. The password in step2 will be asked, and you should be able to work as in local. - -Optional: add the tunnel as in `ssh -fN -L 8000:localhost:8089 marc@172.16.6.32` -to your `~/.bashrc`, and it will be active but in the background, and started every time you open a new terminal. So no need to have a terminal blocked (but do not close it!). -To make it effective either restart terminal or `source ~/.bashrc`. - - -## Jupyter lab as a service - -After previous section, you're able to run notebooks on the server, and accessing to them via browser. So, even though it says `localhost:8000`, you're in the server! (tunnels dark magic). However, it's really annoying going to the server and start the sever every time. This can be automated by running it as a service source: - -* Set the service file `/usr/lib/systemd/system/jupyter.service` (yes, you probably need to create some dirs) as in -```bash -[Unit] -Description=Jupyter Lab - -[Service] -Type=simple -PIDFile=/run/jupyter.pid -ExecStart=/home/marc/miniconda3/bin/jupyter lab --no-browser --port=8089 -User=marc -Group=marc -WorkingDirectory=/home/marc -Restart=always -RestartSec=10 -#KillMode=mixed -[Install] -WantedBy=multi-user.target -``` - -* `sudo systemctl enable jupyter.service` -* `sudo systemctl daemon-reload` -* `jupyter lab --generate-config` -* `jupyter lab password` -* `sudo systemctl restart jupyter.service` - -## Notebook tips and tricks - -### Autoreload - -(source) - -When you change something in sources, usually you have to restart the kernel. This allows to automatically import functions again. Just add - -`%load_ext autoreload`
    -`%autoreload 2` - -### Table of contents - -Check out this useful [extension](https://github.com/jupyterlab/jupyterlab-toc). -Especially interesting when you're writing a tutorial out of a notebook. - -### Kernels auto-discovery - -Check out [this](https://github.com/Anaconda-Platform/nb_conda_kernels), allowing -you to have available every conda environment irrespective of which environment -you launched the server from. With this enabled, the `python -m spacy ipykernel ...` -in my previous post is no more required. - -### Jupytext - -Turn your notebooks into .py files automatically synchronised, see [this](https://github.com/mwouts/jupytext). Many advantages: -* Good to keep track in version control -* In this regard, when submitting a Pull Request this makes easier to comment on -notebooks if necessary -* One can potentially apply [black](https://github.com/psf/black) to auto-format the code in your notebok by -applying *black* to your .py file and then synchronising with the notebook - - -### Display various dataframes side by side - - -Use the following code: - -
    - -

    - -and even nicer, if you have a list of dataframes, the following will show them in rows of 3 columns: - - -
    - -

-
-## Conclusion
-
-In this post, I have shown some of the tricks I have learned in the recent years for working on remote and with Jupyter notebooks.
-
-As always, any recommendation, suggestion or improvement, please let me know. Thanks for reading!
diff --git a/src/content/posts/2019-02-08-jupyter-lab-on-a-remote-machine.md b/src/content/posts/2019-02-08-jupyter-lab-on-a-remote-machine.md
new file mode 100644
index 0000000..1e5c0a7
--- /dev/null
+++ b/src/content/posts/2019-02-08-jupyter-lab-on-a-remote-machine.md
@@ -0,0 +1,73 @@
+---
+title: JupyterLab on a Remote Machine
+date: '2019-02-08 00:00:00 +0000'
+permalink: /posts/jupyter-lab-on-a-remote-machine/
+categories:
+- Dev tools
+isPinned: false
+---
+
+Today we'll see my config for running Jupyter notebooks when working on a remote machine. This is typical in a data scientist's / ML engineer's life, as typical laptops eventually cannot process tasks due to limitations on memory, speed or disk space.
+
+**Disclaimer**: all info written in this post assumes a Mac as your local, and Ubuntu as remote. Most of it should work also with other combinations of these two.
+
+
+## Notebooks
+
+Jupyter notebooks are very powerful tools for easy prototyping in Python (or even R!), but also for intensive development. However, one typically runs the Jupyter server locally and connects via browser. How do we do this when we want Python to run on our remote box?
+
+1. Install JupyterLab on the remote box to be able to run the notebook server, e.g. `conda install jupyterlab` if working with `conda`.
+
+1. `jupyter lab --no-browser --port=8089`.
+
+1. Now in your local terminal run: `ssh -N -L 8000:localhost:8089 marc@172.16.6.32`.
+
+There will be no answer, just leave this open.
+This creates a tunnel from your port `8000` to the port `8089` on the remote machine (these ports are just examples and can be changed to any number), where the Jupyter server is listening. Note that if you
+have multiple servers, they can all listen on the same port e.g. `8089`, but you have to tunnel them to different *local* ports, so changing the `8000`!
+
+Open a browser and go to `localhost:8000`, and that's it!
+
+**Optional**: add the tunnel as in `ssh -fN -L 8000:localhost:8089 marc@172.16.6.32`
+to your `~/.bashrc`. This will make the tunnel always active, but in the background, and started every time you open a new terminal. So no need to have a terminal blocked (but do not close it!).
+To make it effective either restart the terminal or `source ~/.bashrc`.
+
+
+## JupyterLab as a service
+
+Now you're able to run notebooks on the remote machine and access them via browser. So, even though it says `localhost:8000`, you're in the server! However, it's really annoying going to the remote machine and starting the JupyterLab server every time. This can be automated by running it as a service (source):
+
+* Set the service file `/usr/lib/systemd/system/jupyter.service` (yes, you probably need to create some dirs) as in
+```bash
+[Unit]
+Description=JupyterLab
+
+[Service]
+Type=simple
+PIDFile=/run/jupyter.pid
+ExecStart=/home/marc/miniconda3/bin/jupyter lab --no-browser --port=8089
+User=marc
+Group=marc
+WorkingDirectory=/home/marc
+Restart=always
+RestartSec=10
+#KillMode=mixed
+[Install]
+WantedBy=multi-user.target
+```
+
+* `sudo systemctl enable jupyter.service`
+* `sudo systemctl daemon-reload`
+* `jupyter lab --generate-config`
+* `jupyter lab password`
+* `sudo systemctl restart jupyter.service`
+
+
+and that's it.
Every time the machine starts, it will start this service and leave it listening for incoming requests.
+
+
+## Conclusion
+
+In this short post, I have shown how to spin up a Jupyter server on your remote machine. This way we can run notebooks on the remote machine with the same experience as if the Jupyter server was running locally!
+
+As always, any recommendation, suggestion or improvement, please let me know. Thanks for reading!
diff --git a/src/content/posts/2019-11-08-easy-recipes-for-a-(bit)-better-world.md b/src/content/posts/2019-11-08-easy-recipes-for-a-(bit)-better-world.md
index 14738e0..4f8b8dd 100644
--- a/src/content/posts/2019-11-08-easy-recipes-for-a-(bit)-better-world.md
+++ b/src/content/posts/2019-11-08-easy-recipes-for-a-(bit)-better-world.md
@@ -59,7 +59,7 @@ different extensions my recommendation is to use
+```bash
+docker exec -it <container-id> /bin/bash
+```
+Note that `exec` runs a command within a running container, while `run` starts the container and then runs the command. And by the way, to see the logs of such a running container, we can do:
+```bash
+docker logs -f <container-id>
+```
+
+
+### What's the difference between `CMD` and `ENTRYPOINT`?
+
+`CMD` and `ENTRYPOINT` are both used to specify a default command or program that should be executed when the container starts. However, they have different behaviors:
+- `ENTRYPOINT` specifies a program that will be executed when the container starts. It can take arguments, but it's an executable program itself. It is overridden by using `--entrypoint` with `docker run`.
+- `CMD` specifies a default command that will be executed when the container starts. It can take arguments, but it's not an executable program itself. It is overridden by adding arguments at the end of `docker run`, like we did with `bash`.
+
+Note that `CMD` can be used in conjunction with `ENTRYPOINT`, to give arguments to the executable from `ENTRYPOINT`, but that's a bit of a hack.
+
+### How to open a JupyterLab server to play from within the Docker container?
+
+This might be a way to check dependencies, outputs or to debug in general with a nicer interface. First, let's run like this:
+```bash
+docker run -p 8000:80 -p 8001:81 -it fastapi-with-uv-example bash
+```
+This way we have one port for the FastAPI server and another for the JupyterLab server. Now, we would create a tmux session and run the FastAPI server, which we can check at http://localhost:8000/. Then we can open another tmux session and run:
+```bash
+uv add jupyterlab
+uv run jupyter lab --port 81 --no-browser --allow-root --ip "0.0.0.0"
+```
+Note that 1) it's not "host" but "ip", and 2) we need the IP 0.0.0.0 to bind all network interfaces, which allows external connections to reach the JupyterLab server.
+
+
+## Memory issues
+
+During my time at Globality, I experienced several memory-related issues when training ML models. That's why I found a bunch of tips and tricks that might be useful if you're in a similar situation.
+
+### Ensuring all memory is shared
+
+This is a bit more complicated than the question I posed, but if you are debugging and cannot make a high-performance computing application work (like when training an ML model), it might help to add `--ipc=host` to the `docker run` command. See the full reference in the Docker docs, since it comes with risks.
+
+### Limiting container memory
+
+To properly simulate the environment in production, you can limit the memory available to the container by using the `--memory` and `--memory-swap` flags (plus a few others, see the Docker documentation).
+
+Here's an example:
+```bash
+docker run --memory="10000m" --memory-swap=0 -it -v <host_dir>:<container_dir> --gpus all <image> bash
+```
+For Ubuntu, this capability is not available immediately after Docker installation. See [this page](https://www.serverlab.ca/tutorials/containers/docker/how-to-limit-memory-and-cpu-for-docker-containers/) for instructions to enable it (verified on Ubuntu 18.04).
+To verify that the memory limit is applied once the container is running, and to see memory usage in real time, run `docker stats`.
+
+### Checking memory limits within a Docker container
+
+A container can be run with memory limits, as explained in [Check mem_limit within a docker container](https://stackoverflow.com/questions/42187085/check-mem-limit-within-a-docker-container). To check all limits that are set, add the following to one of the first lines of code that are run within the app:
+```python
+import os
+
+try:
+    os.system("more /sys/fs/cgroup/memory/memory.* | cat")
+except Exception as e:
+    self.logger.info(e)
+    self.logger.info("Problem running `more /sys/fs/cgroup/memory/memory.* | cat`")
+```
+See [here](https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt) for an explanation of the fields. The most relevant are:
+
+* memory.soft_limit_in_bytes
+
+* memory.limit_in_bytes
+
+* memory.memsw.limit_in_bytes
+
+* memory.kmem.limit_in_bytes
+
+You might also look at the `*usage_in_bytes` fields just to double-check that memory usage at that point is low.
+
+
+## Problems with GPUs
+
+First make sure you are passing `--gpus all` to `docker run`. Then, you might also need
+
+```bash
+sudo apt install nvidia-container-toolkit
+sudo apt install nvidia-cuda-toolkit
+sudo systemctl restart docker
+```
+
+Now, e.g.:
+
+```bash
+watch -n 0.5 nvidia-smi
+```
+to keep track of GPU memory every 0.5 seconds.
+
+## Rebuilding the Dockerfile
+
+The following will give you an approximate idea of how the Docker container was created. Note that this will show ALL parents, so if your container inherits from others, it will include those as well:
+
+
+```bash
+docker history --no-trunc <image> > dockerfile_recreated
+```
+
+## Docker Compose
+
+If your application consists of multiple containers, Docker Compose simplifies management. It uses a `docker-compose.yml` file to define the services (containers) and their relationships. Here is an example of an app with a backend, a frontend, and a database:
+
+```yaml
+version: "3.9"
+
+services:
+  backend:
+    build: ./backend
+    ports:
+      - "8000:8000"
+    depends_on:
+      - db
+    environment:
+      - DATABASE_URL=postgresql://user:password@db:5432/dbname
+
+  frontend:
+    build: ./frontend
+    ports:
+      - "3000:80"
+    depends_on:
+      - backend
+
+  db:
+    image: postgres:13-alpine
+    ports:
+      - "5432:5432"
+    environment:
+      - POSTGRES_USER=user
+      - POSTGRES_PASSWORD=password
+      - POSTGRES_DB=dbname
+    volumes:
+      - db_data:/var/lib/postgresql/data
+
+volumes:
+  db_data:
+```
+And the essential commands:
+* **`docker-compose up`:** Builds (if necessary) and starts all the services defined in the `docker-compose.yml` file. Add `-d` for detached mode.
+* **`docker-compose down`:** Stops and removes the containers defined in the `docker-compose.yml` file.
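A few other day-to-day commands that pair well with the file above (the service names are taken from this example):

```bash
docker-compose ps                            # status of all services
docker-compose logs -f backend               # follow the logs of a single service
docker-compose exec db psql -U user dbname   # open a client inside the running db service
```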
diff --git a/src/content/posts/2025-01-10-uv-and-warp.md b/src/content/posts/2025-01-10-uv-and-warp.md
new file mode 100644
index 0000000..eeb7525
--- /dev/null
+++ b/src/content/posts/2025-01-10-uv-and-warp.md
@@ -0,0 +1,174 @@
+---
+title: uv and warp
+date: '2025-01-10 00:00:00 +0000'
+permalink: /posts/uv-and-warp
+categories:
+- Dev tools
+isPinned: true
+---
+
+Today, I'm sharing a couple of tools I've been using recently that have been real **game-changers**. On the one hand, [uv](https://docs.astral.sh/uv) is an **ultra-fast** Python package manager that comes to replace [pip-tools](https://github.com/jazzband/pip-tools) and [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) in my case. On the other hand, [warp](https://www.warp.dev/) is an **AI-powered terminal** that made me move from [iterm2](https://iterm2.com/). Let's get started πŸš€
+
+## **uv**: The Rust-Powered Python Package Manager
+
+### A little bit of my background
+
+I've been using Python for years, and like many data scientists, I started using [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) for Python environment management. With [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) I got:
+- **Environment management**: `conda` creates isolated environments that can be used to manage each project, so each project can have its own set of dependencies and versions, but also potentially its **own Python version**. These environments are typically installed in the home directory, separated from the code. This was "the Python way" for me, even though for my pet project [LinguaQuest](https://play.google.com/store/apps/details?id=com.marc.torsoc.linguaquest) I was using `npm`, and there it looked to me more like the native `venv` module I was avoiding with `conda`.
+- **Package Management for Non-Python Dependencies**: A major strength of `conda` is its ability to manage dependencies that are not Python packages. Many scientific computing packages rely on underlying C/C++ libraries, Fortran libraries, or other system-level dependencies. Conda can install and manage these dependencies, which pip struggles with.
+- **Binary Package Installations**: `Conda` often installs packages as pre-compiled binaries. This can be significantly faster than building packages from source, which is often what `pip` does.
+
+Then I would have a `setup.py` declaring the dependencies for the project and `pip install` it.
+
+However, when working on production projects, you want to make sure that the environment is reproducible. So you pin dependencies. The classical simple approach is `pip freeze > requirements.txt`.
+
+But then, upgrading dependencies becomes a nightmare because we're pinning sub-dependencies. And conflict resolution is left to pip, which a) might not be accurate, and b) is terribly slow. Sometimes it has to download a package just to understand its dependencies.
+
+At the end of my adventure with Eigen, we started using [poetry](https://python-poetry.org/). This solved some problems with dependency resolution, but was **really, really slow**.
+
+Later, during my time at Globality, we used [pip-tools](https://github.com/jazzband/pip-tools) to define projects' dependencies. This would look into the dependencies declared in `setup.py`, and generate a `requirements.txt` file. It worked well for most of the issues, but it was still really slow.
+
+### uv to the rescue!
+
+Built in Rust, `uv` resolves dependencies [10–100x faster](https://medium.com/@vanessagaigher/python-package-managers-is-uv-really-faster-than-poetry-478da7ff43e4) than `pip`/`poetry`. It also uses global caching and parallel processing to minimize redundant downloads (e.g., cached installations take ~2s vs 41s for pip). `uv` feels a bit closer to `npm` for Node.js than to `conda`. It also creates a `uv.lock` file, but you just manage the `pyproject.toml` file. And the environment is created right inside the project, within the `.venv` directory. In their words, it is
+
    +A single tool to replace pip, pip-tools, pipx, poetry, pyenv, twine, virtualenv, and more. +
+
+and for me, this means that I could remove `conda` and `pip-tools` from my workflow, and just stick to `uv`.
+
+
+### Basic Commands
+There is great [documentation](https://docs.astral.sh/uv), and it's not really worth replicating it here.
+
+#### Project Initialization
+  Create a new Python project with `uv init`, which generates a `pyproject.toml` file and a basic project structure:
+  ```bash
+  uv init myproject # Creates the project files and basic structure
+  ```
+  Now you'll see
+
+  ```bash
+  $ tree myproject
+  myproject/
+  β”œβ”€β”€ pyproject.toml
+  β”œβ”€β”€ hello.py
+  β”œβ”€β”€ README.md
+  β”œβ”€β”€ .python-version
+  └── .gitignore
+  ```
+  After this, I recommend doing `uv run hello.py` so the environment is created in `.venv`. Note that every time you do `uv run`, if there are dependencies that are not installed in the virtual environment, they will be automatically installed. See all details in [the documentation](https://docs.astral.sh/uv/concepts/projects/init/#applications).
+
+#### Dependency Management
+  Add or remove packages with `uv add` and `uv remove`, which automatically update the `uv.lock` file for reproducible builds:
+  ```bash
+  uv add pandas # Installs pandas and its dependencies
+  uv remove requests # Removes a package and its unused dependencies
+  uv add "httpx>=0.20" # You can specify package versions with constraints
+  uv add "httpx @ git+https://github.com/encode/httpx" # Install from git
+  uv add ../httpx # Install from a local directory
+  ```
+  To update a dependency, also use `uv add`.
+
+
+#### Environment Synchronization
+  Use `uv sync` to ensure the virtual environment matches the lockfile:
+  ```bash
+  uv sync # Creates/updates .venv and resolves dependencies
+  ```
+
+#### Updating the Python version
+
+Simply go to the `.python-version` file and change the version number. Then, when you do `uv run`, it will automatically update the virtual environment and resolve dependencies.
+
+#### A real example
+
+This is a real example of a `pyproject.toml` file for one of my personal projects:
+```toml
+[project]
+name = "devs-scraper"
+version = "0.1.0"
+description = """
+    A scraper for developments
+"""
+authors = [
+    { name = "marctorsoc" }
+]
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "langchain>=0.3.15",
+    "langchain-ollama>=0.2.2",
+    "langfuse>=2.57.11",
+    "langgraph>=0.2.65",
+    "ollama>=0.4.6",
+    "python-dotenv>=1.0.1",
+    ...
+    "nbconvert>=7.16.6",
+]
+
+[tool.uv.sources]
+crawl4ai = { git = "https://github.com/marctorsoc/crawl4ai" }
+deboiler = { git = "https://github.com/marctorsoc/deboiler" }
+
+[project.scripts]
+scrape = "scrape:main"
+
+[build-system]
+requires = ["setuptools>=64", "wheel", "cython"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+packages = ["devs_scraper"]
+
+[tool.setuptools.packages.find]
+exclude = ["*.tests", "*.tests.*", "tests.*", "tests"]
+```
+
+Notes:
+* Please see the docs [here](https://docs.astral.sh/uv/concepts/projects/config/#build-systems) about configuring the build system. In this case, I chose `setuptools` as the backend. Note that if a build system is not defined, `uv` won't install the project itself, just the dependencies.
+* See how to declare dependencies that are not on PyPI, e.g. on GitHub. I made a fork of each repository to be able to share the code with others. However, you can also use a local package as a source and add it as explained above. That will create a new source, e.g. `deboiler = { path = "../deboiler" }`.
+* You probably don't need both the `[tool.setuptools]` and `[tool.setuptools.packages.find]` sections. Just use one of them to locate the package to be installed.
+
+### Advanced Workflows
+- **Jupyter from VSCode Integration**: While there is a [guide](https://docs.astral.sh/uv/guides/integration/jupyter/#using-jupyter-from-vs-code) on how to do this, I'd recommend simply making `.venv/bin/python` the kernel for the notebook. You don't really need to do anything, just click on the proper buttons 😁.
+- **Docker & FastAPI support**: Check the official guides:
+  - [FastAPI Integration](https://docs.astral.sh/uv/guides/integration/fastapi/)
+  - [Docker Integration](https://docs.astral.sh/uv/guides/integration/docker/)
+
+
+## **Warp**: The AI-Powered Terminal for Modern Developers
+
+Warp reimagines the terminal with features that boost productivity and reduce friction.
+
+### A little bit of my background
+
+When I started using terminals, I quickly figured I could not continue with the default one. I moved first to [Terminator](https://gnome-terminator.org/) and later on to [iterm2](https://iterm2.com/). They are both great, but then the era of AI came.
+
+### Key Features
+1. **Agent Mode (AI Integration)**
+   - **Error Debugging**: Get explanations and fixes for errors directly in the terminal.
+   - **Code Generation**: Generate scripts or commands using natural language (e.g., "Give me the command to check what's on port 8089").
+
+2. **IDE-Like Editing**
+   Edit commands directly in the terminal with syntax highlighting, multi-line support, easy copy-paste, and more.
+
+3. **Smart Autocompletion**
+   Context-aware suggestions for commands, flags, and file paths, powered by your command history and system context.
+
+4. **Typo Correction**
+   Warp detects typos (e.g., `got commit` β†’ suggests `git commit`) and offers fixes.
+
+5. **Split Panes**
+   Run multiple commands simultaneously in split views, ideal for monitoring logs or parallel workflows.
+
+
+## Conclusion
+
+Today, I've shared two powerful tools that I use every day:
+
+- **uv** excels in Python workflows with speed and reproducibility.
+- **Warp** transforms terminal usage into an AI-enhanced experience.
+
+Both tools prioritize developer ergonomics, making them indispensable for modern coding. Explore their full potential through their official documentation and community resources!
diff --git a/src/content/posts/2025-02-08-data-science-on-remote.md b/src/content/posts/2025-02-08-data-science-on-remote.md
new file mode 100644
index 0000000..7888508
--- /dev/null
+++ b/src/content/posts/2025-02-08-data-science-on-remote.md
@@ -0,0 +1,128 @@
+---
+title: Data Science on a Remote Machine
+date: '2025-02-08 00:00:00 +0000'
+permalink: /posts/data-science-in-remote/
+categories:
+- Dev tools
+isPinned: false
+---
+
+In this post, I'll share some practical tips and tricks for using a remote machine for data science, covering first the why, then connections, tmux, and more.
+
+*Disclaimer: all info written in this post assumes a Mac as your local, and Ubuntu as remote. Most of it should work also with other combinations of these two.*
+
+## Why Work on a Remote Machine?
+
+There are several reasons to work on a remote machine, including:
+
+- **Resource Limitations:** Running computationally heavy tasks that require more CPU, memory, or storage than your local machine can handle.
+- **Accessibility:** Accessing your environment from different locations when you're away from your primary computer.
+- **Collaboration:** Sharing a centralized machine with other team members for easier data and environment access.
+- **Security:** Using secure remote environments for sensitive data that shouldn't reside on a local device.
+- **Scalability:** Leveraging cloud-based resources that can be scaled up or down depending on workload needs.
+
+## Accessing a Remote Machine
+
+To access a remote server, you can use:
+
+```bash
+ssh username@ip_address
+```
+
+This requires entering your password, but you can set up passwordless access using SSH keys:
+
+1. Generate an SSH key pair locally:
+   ```bash
+   ssh-keygen -t rsa
+   ```
+2. Create the `.ssh` directory on the remote machine:
+   ```bash
+   ssh username@ip_address mkdir -p .ssh
+   ```
+3. Copy the public key to the remote machine:
+   ```bash
+   cat .ssh/id_rsa.pub | ssh username@ip_address 'cat >> .ssh/authorized_keys'
+   ```
+
+## Simplifying SSH with Configuration
+
+Instead of typing full SSH commands, configure shortcuts in `~/.ssh/config`:
+
+```bash
+Host marc
+    HostName ip_address
+    User username
+    IdentityFile ~/.ssh/id_rsa
+```
+
+Now you can simply use:
+
+```bash
+ssh marc
+```
+
+and the same for any other command where you would write `username@ip_address`. You'll see some examples below.
+
+## File Transfer
+
+### Using `scp`
+
+Transfer files between your local machine and the remote server.
+
+- From remote to local:
+  ```bash
+  scp marc:~/titanic.log .
+  ```
+- From local to remote:
+  ```bash
+  scp -r ~/experiments/results marc:~/results/
+  ```
+
+### Using `rsync`
+
+Synchronize directories efficiently with `rsync`. The following command synchronizes files from the remote machine to the local directory, skipping files that already exist:
+
+```bash
+rsync -arv --ignore-existing --progress marc:/home/marc/results experiments/results/
+```
+
+## Running Shells in the Background
+
+Running long processes remotely requires keeping them active even if the connection drops. While `screen` is an option, I prefer using `tmux`. Here's a cheatsheet of `tmux` commands:
+
+- List sessions:
+  ```bash
+  tmux ls
+  ```
+- Create a new session:
+  ```bash
+  tmux new -s name
+  ```
+- Attach to an existing session:
+  ```bash
+  tmux attach -t name
+  ```
+- Detach from a session (`Control + b`, then `d`)
+- Kill all sessions:
+  ```bash
+  killall tmux
+  ```
+- Scrolling: Enter scroll mode with `Control + b` followed by `[`. Navigate using arrow keys. To exit scroll mode, press `q`.
+
+## VS Code's Remote Development: A Game Changer
+
+If you checked my old posts, you'll see I was a big proponent of PyCharm over VSCode. However, I used to run notebooks on a Jupyter server, and then connected via browser after setting up a tunnel. You can check my old config [here](/posts/jupyter-lab-on-a-remote-machine/), in case it's still useful at all. I really liked PyCharm's Git integration, especially for merges and comparing branches. And, to be honest, I still think it's a better interface than the one in VSCode, at least with all the extensions I tried.
+
+But at some point I found VSCode's Remote Development feature, and it really became a game changer for me. VSCode supports:
+- Opening notebooks directly in the IDE, which means you get all the advantages of an IDE (code and import completion, error highlighting, search, debugging, etc.)
+- Running notebooks on the remote machine, with the same experience as if they were running locally
+- LLM-powered autocomplete within the notebook itself!
+- A table of contents (called Outline) to navigate the notebook.
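Before wrapping up, one trick that combines the SSH config and tmux sections above (assuming the `marc` alias defined earlier): you can SSH straight into a tmux session, creating it if it doesn't exist yet:

```bash
ssh -t marc tmux new -A -s work   # -A attaches to "work" if it exists, creates it otherwise
```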
+
+
+## Conclusion
+
+In this post, I shared some good practices for working with a remote machine, especially for data science projects. With these tools and techniques, you'll be more efficient and productive when working remotely.
+
+As always, any recommendation, suggestion or improvement, please let me know. Thanks for reading!
diff --git a/src/content/posts/2025-02-13-working-with-notebooks.md b/src/content/posts/2025-02-13-working-with-notebooks.md
new file mode 100644
index 0000000..882ef20
--- /dev/null
+++ b/src/content/posts/2025-02-13-working-with-notebooks.md
@@ -0,0 +1,443 @@
+---
+title: Working with notebooks
+date: '2025-02-13 00:00:00 +0000'
+permalink: /posts/working-with-notebooks
+categories:
+- Dev tools
+isPinned: true
+---
+
+In data science, experimentation is key. You often need to explore datasets, visualize results, tweak models, and iterate quickly. **Jupyter notebooks** provide an ideal environment for this workflow:
+
+* πŸ“ Interactive execution: Run code in small, manageable chunks instead of executing entire scripts.
+* πŸ“Š Rich outputs: Easily display tables, charts, and even interactive visualizations inline.
+* πŸ”„ Reproducibility: Keep code, results, and documentation together in a structured way.
+
+Because of these benefits, Jupyter notebooks have become a go-to tool for data scientists.
+
+## Working with Jupyter
+
+For years, I worked first in the classic Jupyter Notebooks, and later on in JupyterLab. But still, I found myself
+transitioning from prototyping in Jupyter in the browser to writing *serious code* in my IDE (at the time,
+PyCharm). When working remotely, I had my system where I would have a service on the EC2 instance,
+create a tunnel, and access via browser as if the Jupyter server was local. See [here](/posts/jupyter-lab-on-a-remote-machine/) for details of my old config.
+
+
+## VSCode: A Better Integrated Experience
+
+But while working at Globality, I figured that not having a good way to edit Python files (not notebooks) with the advantages of an IDE was a big impediment to my workflow. And here's where I embraced VSCode. I was very reluctant about VSCode at first, because for me its Git integration was not as good as PyCharm's. But eventually the many advantages outweighed this little issue (which I still miss, to be honest). The main reasons for my move to VSCode were:
+
+- **Integrated Development Environment (IDE)**: VSCode has an excellent IDE with features like code completion, imports, navigation to sources, and syntax highlighting directly in the editor itself, for notebooks. So you can edit Python scripts and notebooks within the same app!
+- **Remote Development**: VSCode makes it easy to connect to a remote machine via SSH and edit files with the same experience you would have if the files were local. Same for interaction with remote notebooks.
+- **Easy navigation**: I used to employ this [extension](https://github.com/jupyterlab/jupyterlab-toc) to help me navigate through my notebooks via the markdown headers. In VSCode, this is already integrated in the IDE (called `outline`).
+
+### Autocomplete
+
+With the recent advances in AI, we can now run a small LLM (yes, a "small large language model" sounds like a contradiction) on our laptops. This is great, because it means we can have autocompletions **for free** and **without requiring an internet connection**. It even has privacy implications.
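If you want to try this, and assuming [ollama](https://ollama.com/) (introduced just below) is installed, grabbing a small coding model is a one-liner; the tag here is the one used in the config that follows:

```bash
ollama pull qwen2.5-coder:1.5b-base   # download the model
ollama list                           # verify the exact name of what you got
```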
+
+In my case, I have an M3 and I use [ollama](https://ollama.com/library/qwen2.5-coder:1.5b) and [continue](https://docs.continue.dev/customize/model-providers/ollama#autocomplete-model), an extension for VSCode. This is my config:
+```json
+"tabAutocompleteModel": {
+    "title": "Qwen2.5-Coder 1.5B",
+    "model": "qwen2.5-coder:1.5b-base",
+    "provider": "ollama"
+  },
+```
+but feel free to try another model you like. This was good enough for me, and extremely fast.
+
+By the way, if you run into issues with the extension, try `ollama ls` and make sure the name in `model` matches one of the models you have downloaded. This was an issue for me because on one laptop I had `qwen2.5-coder:1.5b-base` and on the other one **the same model** was called `qwen2.5-coder:1.5b` 🤷‍♂️
+
+
+### Shortcuts
+
+I love automating things. That's probably the main reason why I'm working on AI. And shortcuts are definitely one of my favorite things. These are the shortcuts I strongly recommend you configure in VSCode, together with my current keystrokes:
+
+#### General
+
+- `Cmd + D`: Duplicate line
+- `Cmd + Shift + D`: Remove current line
+- `` Ctrl + ` ``: Move focus between editor and terminal
+- `Ctrl + Shift + F`: Find in all files
+- `Cmd + Shift + F`: Find file by name
+- `Cmd + Y`: Reveal definition of current symbol/function
+- `Shift, then Shift`: Toggle sidebar visibility
+- `Cmd + Shift + U`: Transform to upper case
+- `Cmd + Shift + Ctrl + U`: Transform to lower case
+- `Cmd + Alt + Z`: Navigate back
+- `Alt + Shift + F`: Format document (and I use [globality-black](https://github.com/marctorsoc/globality-black) for this)
+- `Alt + Down`: Creates another cursor on the line below. This is nice when you want to edit multiple lines at the same time. To exit this mode, remember to press `Esc`.
+
+To be honest, I don't remember which of these come preconfigured in VSCode, but these are the top ones that came to mind. If you have any other suggestions, or there's one that you cannot find how to configure in VSCode, please let me know.
+
+#### Notebooks
+
+Here the only shortcut that I configured myself was `Cmd + Shift + 0, then 0`, to restart the notebook kernel. This is very useful to me. Everything else I left as is. You can check the official shortcuts e.g. [here](https://code.visualstudio.com/docs/datascience/jupyter-notebooks).
+
+
+### VSCode struggles with long outputs
+
+Sometimes, you'll notice that when displaying a long output in a notebook, VSCode will slow down, or simply force you to restart the kernel. This happened to me recently. In my project, I had a structure like this:
+```python
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+@dataclass
+class Page:
+    url: str
+    text: str
+
+    parent: Optional["Page"] = None
+    children: list["Page"] = field(default_factory=list)
+```
+that represents a page on my website. Note that it is a recursive data structure, since every page has pointers to its parent and children. When showing a single `Page` in a notebook, VSCode will try to show its parent and children, which in turn have parents and children themselves, etc.
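+
+Even a tiny tree is enough to see the shape of the problem. A quick, hypothetical reproduction (the URLs are made up):
+```python
+root = Page(url="https://example.com", text="home")
+child = Page(url="https://example.com/blog", text="blog", parent=root)
+root.children.append(child)
+
+root  # the default repr nests children inside parents; on a real site this gets huge
+```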
+
+One solution I found was to modify the default `__repr__` and `_repr_html_` methods of the class:
+```python
+def _repr_html_(self) -> str:
+    """
+    Used when showing the Page directly on the notebook
+    """
+
+    # Trim the text to a reasonable length
+    trimmed_text = (
+        (self.text[:200] + "...")
+        if len(self.text) > 200
+        else self.text
+    )
+
+    # Get the parent URL (if exists)
+    parent_url = self.parent.url if self.parent else "None"
+
+    # Get the URLs of the children (max 5, with "..." if more)
+    children_urls = [child.url for child in self.children]
+    if len(children_urls) > 5:
+        children_urls = children_urls[:5] + ["..."]
+    children_html = (
+        "<ul>"
+        + "".join(f"<li>{url}</li>" for url in children_urls)
+        + "</ul>"
+    )
+
+    # Create an HTML representation
+    html = f"""
+    <div>
+        <b>URL:</b> {self.url}<br>
+        <b>Text:</b> {trimmed_text}<br>
+        <b>Parent URL:</b> {parent_url}<br>
+        <b>Children:</b> {children_html if children_urls else "None"}
+    </div>
+    """
+    return html
+```
+and
+```python
+def __repr__(self) -> str:
+    """
+    Used when showing the Page within e.g. a list on the notebook
+    (list calls _repr_html_, but for each element calls __repr__)
+    """
+
+    # Trim the text to a reasonable length
+    trimmed_text = (
+        (self.text[:100].replace("\n", " ") + "...")
+        if len(self.text) > 100
+        else self.text
+    )
+
+    # Get the parent URL (if exists)
+    parent_url = self.parent.url if self.parent else "None"
+
+    # Get the URLs of the children (max 5, with "..." if more)
+    children_urls = [child.url for child in self.children]
+    if len(children_urls) > 5:
+        children_urls = children_urls[:5] + ["..."]
+    if children_urls:
+        children_str = "\n- " + "\n- ".join(children_urls)
+    else:
+        children_str = "None"
+
+    # Create the representation
+    output = f"""
+    * URL: {self.url}
+    * Text: {trimmed_text}
+    * Parent URL: {parent_url}
+    * Children: {children_str}
+    """
+    return output
+```
+
+This shows representations like this:
+
+<div align="center">
+  <img src="/content/page-repr.jpg" alt="Custom Page representation in a notebook">
+</div>
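+
+With these two methods in place, both entry points render compactly. Continuing the tiny `root` / `child` sketch from above:
+```python
+root           # as the last expression of a cell, shows the HTML card from _repr_html_
+[root, child]  # a list display falls back to the plain-text __repr__ of each element
+```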
+
+### New from 2025
+
+#### Inline values
+
+[This](https://code.visualstudio.com/updates/v1_97#_inline-values-upon-cell-execution) is something I found recently. Are you still writing cells like this?
+```python
+result = long_function(params)
+result
+```
+so you can visualize the result? Now, by turning on
+```json
+"notebook.inlineValues": true,
+```
+in your VSCode settings, you can see the values directly.
+
+<div align="center">
+  <img src="/content/notebook-inline-value.jpg" alt="Inline values shown next to the code">
+</div>
+
+#### Verbose time execution
+
+You might have noticed that after a cell is executed, we can see some execution time information in the cell status bar. This is a setting I missed and [requested](https://github.com/microsoft/vscode-jupyter/issues/13338#issuecomment-1645393350) from the very beginning of adopting VSCode, and it was partially released [here](https://github.com/microsoft/vscode/issues/168792#issuecomment-2512350627). The docs for this are [here](https://code.visualstudio.com/updates/v1_96#_cell-execution-time-verbosity). TLDR:
+```json
+"notebook.cellExecutionTimeVerbosity": "verbose",
+```
+
+But I'm still missing one of the benefits I was getting in Jupyter with [this extension](https://jupyter-contrib-nbextensions.readthedocs.io/en/latest/nbextensions/execute_time/readme.html). There, this information would be stored within the notebook metadata, which had two advantages:
+1. When one restarts VSCode, it's really useful to see how long a particular cell took to run. With the current implementation, all such information is lost.
+2. When transforming the notebook into an HTML report (e.g. with [nbconvert](https://nbconvert.readthedocs.io/en/latest/)), I'd like to have this info there too. I used to have a template that would leverage this, and dump it to the HTML too.
+
+For this reason, I opened this [issue](https://github.com/microsoft/vscode/issues/240405). If you read this, please support me in asking for this feature 😊
+
+#### Run cells in section
+
+If you are like me, and organize your notebook with markdown headers, you can easily navigate via the `Outline` section on the sidebar. But now, there's another big plus! As explained [here](https://code.visualstudio.com/updates/v1_96#_run-cells-in-section-for-markdown), we can now run an entire "section" of cells defined by a markdown header. It is as simple as right-clicking and selecting:
+
+<div align="center">
+  <img src="/content/notebook-run-cells-section.jpg" alt="Run cells in section">
+</div>
+
+## Tools related to notebooks
+
+### jupytext
+
+*Have you always wished Jupyter notebooks were plain text documents? Wished you could edit them in your favorite IDE? And get clear and meaningful diffs when doing version control? Then, Jupytext may well be the tool you're looking for!*
+
+This is the elevator speech of [jupytext](https://jupytext.readthedocs.io/en/latest/), and to be honest, it is quite convincing! However, things have changed a bit since it was first released. Now, VSCode [supports](https://code.visualstudio.com/docs/datascience/jupyter-notebooks#_custom-notebook-diffing) diffing Jupyter notebooks quite nicely. My understanding is that this is a relatively new feature, because I did not see it before. But I could not find its release notes.
+
+In any case, you probably don't want to add your notebook files to version control:
+1. Because they contain the outputs, which can make the files heavy
+2. Because, when looking at it online, it's just simpler to look at a plain text `.py` file.
+
+Whichever is your reason, these are the two basic commands to work with Jupytext:
+```bash
+jupytext --to py notebook.ipynb
+jupytext --to ipynb notebook.py
+```
+and I'll defer you to the [docs](https://jupytext.readthedocs.io/en/latest/) for more.
+
+#### Missing features in VSCode
+
+This is yet another feature that I miss from the transition from Jupyter to VSCode.
+In Jupyter, one has the ability to [pair](https://jupytext.readthedocs.io/en/latest/paired-notebooks.html) a notebook with its Python file. It works as follows:
+
+* When a paired notebook is opened, the inputs are loaded from the most recent file in the pair, and the outputs are loaded from the `.ipynb` file.
+
+* When a paired notebook is saved, the corresponding Python file is updated, as it would be with a `jupytext --to py` run.
+
+To the best of my knowledge, this feature is not yet (2025) available in VSCode. So I still find myself running the CLI for the "basic behavior". There's this long-standing [issue](https://github.com/microsoft/vscode-jupyter/issues/1240) if you want to track this.
+
+### logger
+
+Due to the **inability to persist execution times** in VSCode notebooks, I use a simple Python logger when displaying messages in my notebooks. This is not perfect, but it's better than nothing. It also provides a way to time the steps of a cell execution, as shown below. The config is as simple as this:
+```python
+import logging
+
+logging.basicConfig(format="%(asctime)s - %(message)s")
+log = logging.getLogger("lang-examples")
+log.setLevel(logging.INFO)
+log.info("Starting ...")
+```
+showing e.g.:
+```
+2024-11-18 18:33:21,544 - Starting ...
+```
+
+Note that if you run this in the notebook, then you can **retrieve the same logger** within your Python files with
+```python
+log = logging.getLogger("lang-examples")
+```
+and you don't need to configure the logger again. Another great, but often overlooked, feature of loggers is that you can set their level (as done above). So a common practice when debugging would be to wrap the line in your notebook like this:
+```python
+log.setLevel(logging.DEBUG)
+run_long_function_to_debug()
+log.setLevel(logging.INFO)
+```
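+
+Regarding timing: since every log line carries a timestamp, bracketing the steps of a cell with `log.info` calls gives you per-step durations by comparing consecutive timestamps. A minimal sketch (the step functions are made up for illustration):
+```python
+log.info("Loading dataset ...")
+df = load_dataset()  # hypothetical step
+log.info("Computing features ...")  # delta vs. previous timestamp = loading time
+features = compute_features(df)  # hypothetical step
+log.info("Done")
+```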
+
+### autoreload
+
+If you are working on a Jupyter Notebook, then it is very convenient to use the `autoreload` extension. It will automatically reload all modules that have been imported in your notebook when they change (e.g. if you edit them). This can be done by running:
+```python
+%load_ext autoreload
+%autoreload 2
+```
+But there's a small trick some people don't know. If you have constants defined in a Python file, **they won't be reloaded**. This happened to me when working with LLMs, where you might define prompt templates as constants and import them when creating a chain. In order to reload the prompt without restarting the kernel, my hack was to wrap the constant in a function that returns it:
+```python
+def get_prompt_template():
+    return """
+    You are a helpful assistant...
+    """
+```
+
+## Working with DataFrames in notebooks
+
+
+### DataFrames side by side
+
+A small helper to display several DataFrames next to each other, with top alignment:
+```python
+import pandas as pd
+from IPython.display import display_html
+
+def display_side_by_side(*dfs: pd.DataFrame) -> None:
+    """Displays multiple DataFrames side by side in a Jupyter Notebook with top alignment."""
+    html_str = "".join(df.to_html() for df in dfs)
+    styled_html = html_str.replace(
+        "<table",
+        "<table style='display:inline-table; margin-right:20px; vertical-align:top;'",
+    )
+    display_html(styled_html, raw=True)
+
+
+def display_dfs(dfs: list[pd.DataFrame], max_columns: int = 3) -> None:
+    """Displays a list of DataFrames side by side, limiting to max_columns per row."""
+    for start_idx in range(0, len(dfs), max_columns):
+        display_side_by_side(*dfs[start_idx:start_idx + max_columns])
+```
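+
+As a quick usage sketch (two throwaway DataFrames, just to see the layout):
+```python
+df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+df2 = pd.DataFrame({"c": [5, 6]})
+
+display_side_by_side(df1, df2)
+```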
+and here is an example of the result:
+
+<div align="center">
+  <img src="/content/side_by_side_example.jpg" alt="DataFrames displayed side by side">
+</div>
+
+### Full display
+
+We often want to see a DataFrame with full column width. This method allows us to do that, and to show all rows as well if desired:
+
+```python
+def full_display(df, max_rows=None, max_colwidth=None):
+    with pd.option_context(
+        "display.max_rows",
+        max_rows,
+        "display.max_colwidth",
+        max_colwidth,
+    ):
+        display(df)
+```
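+
+For instance, a quick before/after on a toy DataFrame with a long text:
+```python
+df = pd.DataFrame({"synopsis": ["a rather long synopsis, " * 10]})
+
+df                # pandas truncates the column at the default width
+full_display(df)  # shows the full text (and all rows, since max_rows=None)
+```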
+
+### Display foldable
+
+In order to show multiple dataframes after a cell, we can show them side by side as explained above, or we can show them one by one, and allow the user to fold / unfold each of them. This can be achieved with the code [here](https://github.com/marctorsoc/lang-examples/blob/main/lang-examples-common/lang_examples_common/utils/display_utils.py#L35). See an example:
+
+<div align="center">
+  <img src="/content/display-foldable.jpg" alt="Foldable display of multiple DataFrames">
+</div>
+
+### Display foldable for nested structures
+
+Imagine you have an object like this:
+```python
+movies = [
+    {
+        "Title": "Inception",
+        "Year": 2010,
+        "Genres": ["Action", "Sci-Fi", "Thriller"],
+        "Ratings": {"IMDB": 8.8, "Rotten Tomatoes": "87%"},
+        "Cast": [
+            {"Actor": "Leonardo DiCaprio", "Role": "Dom Cobb"},
+            {"Actor": "Joseph Gordon-Levitt", "Role": "Arthur"},
+        ],
+    },
+    {
+        "Title": "The Matrix",
+        "Year": 1999,
+        "Genres": ["Action", "Sci-Fi"],
+        "Ratings": {"IMDB": 8.7, "Rotten Tomatoes": "83%"},
+        "Cast": [
+            {"Actor": "Keanu Reeves", "Role": "Neo"},
+            {"Actor": "Laurence Fishburne", "Role": "Morpheus"},
+        ],
+    },
+]
+```
+In the following image, you can see how it would look if we just display it:
+
+<div align="center">
+  <img src="/content/render-nested-false.jpg" alt="Nested structure displayed without render_nested">
+</div>
    + +and now if we use the power of `render_nested=True`: +
+
+<div align="center">
+  <img src="/content/render-nested-true.jpg" alt="Nested structure displayed with render_nested=True">
+</div>
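+
+The linked helper does more than I can show here, but the core idea is small: pretty-print the object and wrap it in an HTML `<details>` tag, which is what makes each block foldable. A minimal sketch of that idea (not the actual implementation):
+```python
+import json
+
+from IPython.display import HTML, display
+
+
+def show_foldable(obj, title: str = "value") -> None:
+    # <details> renders folded by default; clicking the summary expands it
+    body = json.dumps(obj, indent=2)
+    display(HTML(f"<details><summary>{title}</summary><pre>{body}</pre></details>"))
+
+
+show_foldable(movies, title="movies")
+```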
+
+### Pipes
+
+We often want to do some filtering on our data, and log the length of the dataframe at each step. This is where we can make use of pipes. I have two pipes that I use often:
+```python
+def log_len(df, message=""):
+    print(f"{message}: {len(df)}")
+    return df
+
+
+def log_df(df):
+    display(df)
+    return df
+```
+and here is an example:
+
+```python
+df = pd.DataFrame(
+    [
+        {"Title": "Superbad", "Year": 2007, "Rating": 7.6, "genre": "Comedy"},
+        {"Title": "Step Brothers", "Year": 2008, "Rating": 6.9, "genre": "Comedy"},
+        {"Title": "The Big Lebowski", "Year": 1998, "Rating": 8.1, "genre": "Comedy"},
+        {"Title": "The Shining", "Year": 1980, "Rating": 8.4, "genre": "Horror"},
+        {"Title": "Get Out", "Year": 2017, "Rating": 7.8, "genre": "Horror"},
+        {"Title": "It", "Year": 2017, "Rating": 7.3, "genre": "Horror"},
+        {"Title": "Interstellar", "Year": 2014, "Rating": 8.7, "genre": "Sci-Fi"},
+        {"Title": "The Martian", "Year": 2015, "Rating": 7.5, "genre": "Sci-Fi"},
+        {"Title": "Inception", "Year": 2010, "Rating": 8.8, "genre": "Sci-Fi"},
+        {"Title": "Blade Runner", "Year": 1982, "Rating": 9.3, "genre": "Sci-Fi"},
+        {"Title": "Blade Runner 2049", "Year": 2017, "Rating": 8.0, "genre": "Sci-Fi"},
+    ]
+)
+filtered_movies = (
+    df.pipe(log_len, "Initial dataset")
+    .query("genre in ['Comedy', 'Sci-Fi']")
+    .pipe(log_len, "After filtering by Comedy or Sci-Fi")
+    .query("Year > 2000")
+    .pipe(log_len, "After filtering movies older than 2000")
+    .pipe(log_df)
+)
+```
+which shows:
+
+<div align="center">
+  <img src="/content/pipes-example.jpg" alt="Pipes example output">
+</div>
+
+### Visualizing dfs with multi-line texts
+
+If you use `full_display` when a cell in a dataframe contains text with multiple lines, you will see the newline characters as part of the text. Here is a solution to show the text nicely:
+
+```python
+import pandas as pd
+from IPython.display import HTML, display
+
+data = {
+    'title': ['Inception', 'The Matrix', 'Interstellar'],
+    'year': [2010, 1999, 2014],
+    'rating': [8.8, 8.7, 8.6],
+    'synopsis': [
+        "A thief who steals corporate secrets\nby entering the subconscious of his targets\nfaces his greatest challenge yet.",
+        "A computer hacker learns from mysterious rebels\nabout the true nature of his reality\nand his role in the war against its controllers.",
+        "A team of explorers travel through a wormhole\nin space in an attempt to ensure humanity's survival."
+    ]
+}
+
+df = pd.DataFrame(data)
+
+# Full display of the dataframe
+full_display(df)
+
+# Display the DataFrame with HTML rendering
+display(
+    HTML(
+        df
+        .assign(synopsis=lambda df: df.synopsis.str.replace("\n", "<br>"))
+        .to_html(escape=False, justify="left")
+    )
+)
+```
+which produces:
+
+<div align="center">
+  <img src="/content/render-df-with-multi-line-texts.jpg" alt="DataFrame with multi-line texts rendered as HTML">
+</div>
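+
+As a side note, pandas' `Styler` can achieve a similar effect without touching the data, by asking the front end to honor the newlines via CSS (assuming your notebook renderer applies the style):
+```python
+df.style.set_properties(
+    subset=["synopsis"], **{"white-space": "pre-wrap", "text-align": "left"}
+)
+```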
    diff --git a/src/pages/Blog.tsx b/src/pages/Blog.tsx index 5507dde..4d2605f 100644 --- a/src/pages/Blog.tsx +++ b/src/pages/Blog.tsx @@ -61,8 +61,12 @@ export default function Blog() { }, []); const filteredPosts = React.useMemo(() => { - if (selectedCategories.length === 0) return posts; - return posts.filter(post => + // First filter out archived posts + const activePosts = posts.filter(post => !post.isArchived); + + // Then apply category filter if any + if (selectedCategories.length === 0) return activePosts; + return activePosts.filter(post => post.categories.some(category => selectedCategories.includes(category)) ); }, [posts, selectedCategories]); @@ -80,13 +84,15 @@ export default function Blog() { ? prev.filter(c => c !== category) : [...prev, category]; - // Update URL parameters - if (newCategories.length === 1) { - setSearchParams({ category: newCategories[0] }); - } else if (newCategories.length === 0) { - setSearchParams({}); - } - + // Use a separate effect for updating URL params + setTimeout(() => { + const params = new URLSearchParams(); + if (newCategories.length > 0) { + params.set('category', newCategories.sort().join('&')); + } + setSearchParams(params); + }, 0); + return newCategories; }); }; diff --git a/src/pages/Home.tsx b/src/pages/Home.tsx index c0a321f..1b739b3 100644 --- a/src/pages/Home.tsx +++ b/src/pages/Home.tsx @@ -27,7 +27,7 @@ export default function Home() { }, []); const pinnedPosts = posts?.filter(post => post.isPinned) || []; - const recentPosts = posts?.slice(0, 6) || []; + const recentPosts = posts?.filter(post => !post.isArchived).slice(0, 6) || []; return (
    diff --git a/src/pages/Resources.tsx b/src/pages/Resources.tsx index 77eeda4..c82c569 100644 --- a/src/pages/Resources.tsx +++ b/src/pages/Resources.tsx @@ -1,4 +1,3 @@ -import React from 'react'; import ReactMarkdown from 'react-markdown'; import remarkGfm from 'remark-gfm'; import rehypeRaw from 'rehype-raw'; diff --git a/src/types/Post.ts b/src/types/Post.ts index f338f62..1724e4f 100644 --- a/src/types/Post.ts +++ b/src/types/Post.ts @@ -1,3 +1,9 @@ +export interface TOCHeader { + level: number; + text: string; + id: string; +} + export interface Post { content: string; date: string; @@ -6,6 +12,8 @@ export interface Post { title: string; categories: string[]; isPinned?: boolean; + isArchived?: boolean; heroImage?: string; heroImageWidth?: string; + toc?: TOCHeader[]; } \ No newline at end of file diff --git a/src/utils/PostLoader.ts b/src/utils/PostLoader.ts index c2cbdcb..0f1195d 100644 --- a/src/utils/PostLoader.ts +++ b/src/utils/PostLoader.ts @@ -1,8 +1,11 @@ import frontMatter from 'front-matter'; import { Post } from '../types/Post'; +import { TOC_MIN_DEPTH, TOC_MAX_DEPTH } from './constants'; +import { extractHeaders } from './tocUtils'; const DEBUG = import.meta.env.DEV; + function validatePostAttributes(attributes: any): attributes is Post { if (!attributes) { console.error('❌ Post attributes are undefined'); @@ -39,7 +42,7 @@ function validatePostAttributes(attributes: any): attributes is Post { return true; } -async function importAll(): Promise { +async function importAll(minDepth: number = TOC_MIN_DEPTH, maxDepth: number = TOC_MAX_DEPTH): Promise { if (DEBUG) console.log('πŸ”„ Starting to load posts...'); try { @@ -74,7 +77,8 @@ async function importAll(): Promise { } const filename = filepath.split('/').pop() || ''; - + const headers = extractHeaders(body, minDepth, maxDepth); + return { ...attributes, content: body, @@ -82,7 +86,8 @@ async function importAll(): Promise { permalink: attributes.permalink || filepath .replace('/src/content/posts/', '') .replace('.md', ''), - filename + filename, + toc: headers }; } catch (error) { console.error(`❌ Error processing ${filepath}:`, error); @@ -91,19 +96,56 @@ async function importAll(): Promise { }) ); - const validPosts = posts.filter((post): post is Post => post !== null); + const validPosts = posts + .filter((post): post is NonNullable => post !== null) + // Don't use type predicate here, just validate the date + .filter(post => { + try { + const timestamp = new Date(post.date).getTime(); + if (isNaN(timestamp)) { + console.error(`❌ Invalid date format in post: ${post.title}`); + return false; + } + return true; + } catch { + console.error(`❌ Invalid date format in post: ${post.title}`); + return false; + } + }) + // Type assertion in the map function + .map((post): Post => ({ + ...post, + date: new Date(post.date).toISOString() + })); if (DEBUG) console.log(`βœ… Successfully loaded ${validPosts.length} posts`); if (validPosts.length === 0) { console.warn('⚠️ No valid posts were loaded'); + return []; } - return validPosts.sort((a, b) => new Date(b.date).getTime() - new Date(a.date).getTime()); + // Sort posts with safe date comparison + return validPosts.sort((a, b) => { + try { + const dateA = new Date(a.date).getTime(); + const dateB = new Date(b.date).getTime(); + + if (isNaN(dateA) || isNaN(dateB)) { + console.warn('⚠️ Invalid date comparison:', { a: a.date, b: b.date }); + return 0; + } + + return dateB - dateA; + } catch (error) { + console.error('❌ Error comparing dates:', error); + return 0; + } + }); } catch 
(error) { console.error('❌ Fatal error loading posts:', error); return []; } } -export const getPosts = importAll; \ No newline at end of file +export const getPosts = () => importAll(); \ No newline at end of file diff --git a/src/utils/constants.ts b/src/utils/constants.ts index 7ef9104..ef74dcf 100644 --- a/src/utils/constants.ts +++ b/src/utils/constants.ts @@ -1,4 +1,3 @@ -export const CATEGORIES = ['Academia', 'AI', 'Coding', 'Maths', 'Off-topic', 'Work']; export const POSTS_PER_PAGE = 5; export const categoryColors: { [key: string]: string } = { @@ -6,6 +5,12 @@ export const categoryColors: { [key: string]: string } = { 'Academia': 'bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-300', 'Work': 'bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-300', 'Maths': 'bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-300', - 'Coding': 'bg-pink-100 text-pink-800 dark:bg-pink-900 dark:text-pink-300', + 'Dev tools': 'bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-300', + 'Coding puzzles': 'bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-300', 'AI': 'bg-cyan-100 text-cyan-800 dark:bg-cyan-900 dark:text-cyan-300' }; + +export const CATEGORIES = Object.keys(categoryColors).sort(); + +export const TOC_MIN_DEPTH = 1; +export const TOC_MAX_DEPTH = 4; diff --git a/src/utils/markdownUtils.ts b/src/utils/markdownUtils.ts index 7e8fc40..3634c94 100644 --- a/src/utils/markdownUtils.ts +++ b/src/utils/markdownUtils.ts @@ -1,6 +1,8 @@ import frontMatter from 'front-matter'; import { Post } from '../types/Post'; import { getPosts } from './PostLoader'; +import { TOC_MIN_DEPTH, TOC_MAX_DEPTH } from './constants'; +import { extractHeaders } from './tocUtils'; export async function getAllPosts(): Promise { return getPosts(); @@ -16,7 +18,7 @@ interface PostAttributes { heroImageWidth?: string; } -export async function getPost(slug: string): Promise { +export async function getPost(slug: string, minDepth: number = TOC_MIN_DEPTH, maxDepth: number = TOC_MAX_DEPTH): Promise { const DEBUG = false; try { if (DEBUG) console.log('πŸ” Looking for post with slug:', slug); @@ -58,6 +60,8 @@ export async function getPost(slug: string): Promise { throw new Error('Invalid post format'); } + const headers = extractHeaders(body, minDepth, maxDepth); + // Now TypeScript knows the shape of attributes const post: Post = { title: attributes.title, @@ -69,9 +73,11 @@ export async function getPost(slug: string): Promise { .replace('.md', ''), filename: filepath.split('/').pop() || '', heroImage: attributes.heroImage, - heroImageWidth: attributes.heroImageWidth + heroImageWidth: attributes.heroImageWidth, + toc: headers }; + if (DEBUG) console.log('Post TOC:', post.toc); return post; } catch (error) { console.error('Error loading post:', error); diff --git a/src/utils/textUtils.ts b/src/utils/textUtils.ts new file mode 100644 index 0000000..aefc613 --- /dev/null +++ b/src/utils/textUtils.ts @@ -0,0 +1,20 @@ +export function stripMarkdown(text: string): string { + return text + .replace(/`([^`]+)`/g, '$1') // Remove backticks + .replace(/\*\*([^*]+)\*\*/g, '$1') // Remove bold + .replace(/\*([^*]+)\*/g, '$1') // Remove italics + .replace(/_([^_]+)_/g, '$1'); // Remove underscores +} + +export function generateId(text: string, index?: number): string { + // First strip markdown syntax + const cleanText = stripMarkdown(text); + + // Then generate the ID + const id = cleanText + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, ''); + + return typeof index 
=== 'number' ? `${id}-${index + 1}` : id; +} diff --git a/src/utils/tocUtils.ts b/src/utils/tocUtils.ts new file mode 100644 index 0000000..2b65c3e --- /dev/null +++ b/src/utils/tocUtils.ts @@ -0,0 +1,46 @@ +import { generateId } from './textUtils'; +import { TOC_MIN_DEPTH, TOC_MAX_DEPTH } from './constants'; + +export interface Header { + level: number; + text: string; + id: string; +} + +export function extractHeaders( + content: string, + minDepth: number = TOC_MIN_DEPTH, + maxDepth: number = TOC_MAX_DEPTH +): Header[] { + const lines = content.split('\n'); + const headers: Header[] = []; + let inCodeBlock = false; + const headerIds = new Map(); + + const depthRange = Array.from( + { length: maxDepth - minDepth + 1 }, + (_, i) => '#'.repeat(i + minDepth) + ).join('|'); + const headerRegex = new RegExp(`^(${depthRange})\\s+(.*)$`); + + for (let line of lines) { + if (line.startsWith('```')) { + inCodeBlock = !inCodeBlock; + continue; + } + + if (!inCodeBlock && headerRegex.test(line)) { + const level = line.match(/^#+/)?.[0].length || 0; + const text = line.replace(/^#+\s+/, ''); + + const baseId = generateId(text); + const count = headerIds.get(baseId) || 0; + headerIds.set(baseId, count + 1); + const id = count === 0 ? baseId : `${baseId}-${count}`; + + headers.push({ level, text, id }); + } + } + + return headers; +}