build_a_blog: multiprocessing

2024-06-19 15:26:09 -04:00 · 2024-06-19 15:26:09 -04:00 · d2b810441d
commit d2b810441d
parent 23042c2d35
2 changed files with 95 additions and 4 deletions
--- a/main.py
+++ b/main.py
@ -1,9 +1,11 @@
 import os
 import re
 import glob
 import html
 import email
 import logging
 import datetime
 from multiprocessing import Pool, TimeoutError
 from string import Template
 import markdown
@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$')
 logging.basicConfig(encoding='utf-8', level=logging.INFO)
 md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
 cpu_count = os.cpu_count()
 def convert(text):
 	md.reset()
@ -52,11 +55,11 @@ def render_posts():
 	files = glob.glob('posts/*.md')
 	logging.info('found post files %s', files)
 	posts = []
-	for fname in files:
+	logging.info('starting render posts with cpu_count: %d', cpu_count)
-		p = render_post(fname)
+	with Pool(processes=cpu_count) as pool:
-		posts.append(p)
+		posts = pool.map(render_post, files)
 		logging.info('rendered post: %s', p)
 	logging.info("render_posts result: %s", res)
 	return posts
 def posts_list_html(posts):
--- a/posts/build_a_blog.md
+++ b/posts/build_a_blog.md
@ -763,6 +763,7 @@ rm -f ./posts/*__bench*
 ```
 ```shell
 # Run on a 16 core AMD Ryzen 7 7840U
 ❯ ./bench.sh 100
 INFO: removing old __bench files
 INFO: number of *.md files 102
@ -790,6 +791,93 @@ So approx 0.8s per 100 posts which starts to get a bit painful in the thousands.
 Will be a fun future idea to try to solve.
 ### 2024-06-19, gotta go fast?
 The critical part of the program that gets slower with more files is the step where each markdown file is rendered to HTML.
 I'm by no means a Python concurrency expert, but after a quick search, `multiprocessing.Pool` looks like a really quick win here.
 Luckily, `render_posts()` is already in a great shape for using `Pool.map`:
 * 1 array of input file names
 * Call `render_post` with 1 file name as an argument
 * Result is collected in a list.
 So here is the diff to make that happen:
 ```diff
 diff --git a/main.py b/main.py
 index 52ce57b..91d852b 100644
 --- a/main.py
 +++ b/main.py
@@ -1,9 +1,11 @@
 +import os
 import re
 import glob
 import html
 import email
 import logging
 import datetime
 +from multiprocessing import Pool, TimeoutError
 from string import Template
 import markdown
@@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$')
 logging.basicConfig(encoding='utf-8', level=logging.INFO)
 md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
 +cpu_count = os.cpu_count()
 def convert(text):
 	md.reset()
@@ -52,11 +55,11 @@ def render_posts():
 	files = glob.glob('posts/*.md')
 	logging.info('found post files %s', files)
 	posts = []
 -	for fname in files:
 -		p = render_post(fname)
 -		posts.append(p)
 -		logging.info('rendered post: %s', p)
 +	logging.info('starting render posts with cpu_count: %d', cpu_count)
 +	with Pool(processes=cpu_count) as pool:
 +		posts = pool.map(render_post, files)
 +	logging.info("render_posts result: %s", res)
 	return posts
 def posts_list_html(posts):
 ```
 And re-run the benchmarks:
 ```shell
 # Run on a 16 core AMD Ryzen 7 7840U
 ❯ ./bench.sh 100
 INFO: removing old __bench files
 INFO: number of *.md files 102
 INFO: number of *.html files 2
 INFO: running
 real 0.21
 user 1.64
 sys 0.14
 INFO: number of *.html files 102
 INFO: cleanup __bench files
 ❯ ./bench.sh 1000
 INFO: removing old __bench files
 INFO: number of *.md files 1002
 INFO: number of *.html files 2
 INFO: running
 real 1.12
 user 15.69
 sys 0.24
 INFO: number of *.html files 1002
 INFO: cleanup __bench files
 ```
 So down to roughly 1.1s for 1000 posts with 16 cores (`multiprocessing.Pool` defaults to the CPU core count when `processes` is not passed).
 🎉 that's cool!
 [1]: https://crystal-lang.org/
 [2]: https://github.com/crystal-lang/crystal/releases/tag/0.31.0