diff --git a/main.py b/main.py index 52ce57b..91d852b 100644 --- a/main.py +++ b/main.py @@ -1,9 +1,11 @@ +import os import re import glob import html import email import logging import datetime +from multiprocessing import Pool, TimeoutError from string import Template import markdown @@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$') logging.basicConfig(encoding='utf-8', level=logging.INFO) md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)]) +cpu_count = os.cpu_count() def convert(text): md.reset() @@ -52,11 +55,11 @@ def render_posts(): files = glob.glob('posts/*.md') logging.info('found post files %s', files) posts = [] - for fname in files: - p = render_post(fname) - posts.append(p) - logging.info('rendered post: %s', p) + logging.info('starting render posts with cpu_count: %d', cpu_count) + with Pool(processes=cpu_count) as pool: + posts = pool.map(render_post, files) + logging.info("render_posts result: %s", res) return posts def posts_list_html(posts): diff --git a/posts/build_a_blog.md b/posts/build_a_blog.md index a9e4929..e62cf7f 100644 --- a/posts/build_a_blog.md +++ b/posts/build_a_blog.md @@ -763,6 +763,7 @@ rm -f ./posts/*__bench* ``` ```shell +# Run on a 16 core AMD Ryzen 7 7840U ❯ ./bench.sh 100 INFO: removing old __bench files INFO: number of *.md files 102 @@ -790,6 +791,93 @@ So approx 0.8s per 100 posts which starts to get a bit painful in the thousands. Will be a fun future idea to try to solve. +### 2024-06-19, gotta go fast? + +The critical part of the program that gets slower with more files is when each file is rendered from markdown to HTML. + +I'm by no means a Python concurrency expert, but after a quick search `multiprocessing.Pool` looks like a really quick win here. + +Luckily `render_posts()` is already in a great format for using `Pool.map`: + +* 1 array of input file names +* Call `render_post` with 1 file name as an argument +* The results are collected in a list.
+ +So here is the diff to make that happen: + +```diff +diff --git a/main.py b/main.py +index 52ce57b..91d852b 100644 +--- a/main.py ++++ b/main.py +@@ -1,9 +1,11 @@ ++import os + import re + import glob + import html + import email + import logging + import datetime ++from multiprocessing import Pool, TimeoutError + from string import Template + + import markdown +@@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$') + logging.basicConfig(encoding='utf-8', level=logging.INFO) + + md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)]) ++cpu_count = os.cpu_count() + + def convert(text): + md.reset() +@@ -52,11 +55,11 @@ def render_posts(): + files = glob.glob('posts/*.md') + logging.info('found post files %s', files) + posts = [] +- for fname in files: +- p = render_post(fname) +- posts.append(p) +- logging.info('rendered post: %s', p) ++ logging.info('starting render posts with cpu_count: %d', cpu_count) ++ with Pool(processes=cpu_count) as pool: ++ posts = pool.map(render_post, files) + ++ logging.info("render_posts result: %s", res) + return posts + + def posts_list_html(posts): +``` + +And re-run the benchmarks: + +```shell +# Run on a 16 core AMD Ryzen 7 7840U +❯ ./bench.sh 100 +INFO: removing old __bench files +INFO: number of *.md files 102 +INFO: number of *.html files 2 +INFO: running +real 0.21 +user 1.64 +sys 0.14 +INFO: number of *.html files 102 +INFO: cleanup __bench files + +❯ ./bench.sh 1000 +INFO: removing old __bench files +INFO: number of *.md files 1002 +INFO: number of *.html files 2 +INFO: running +real 1.12 +user 15.69 +sys 0.24 +INFO: number of *.html files 1002 +INFO: cleanup __bench files +``` + +So down to about 1.1s for 1000 posts with 16 cores (`multiprocessing.Pool` uses the CPU core count by default if `processes` is not passed). + +🎉 that's cool! [1]: https://crystal-lang.org/ [2]: https://github.com/crystal-lang/crystal/releases/tag/0.31.0