build_a_blog: multiprocessing

Collin Lefeber 2024-06-19 15:26:09 -04:00
parent 23042c2d35
commit d2b810441d
2 changed files with 95 additions and 4 deletions

11
main.py
View file

@@ -1,9 +1,11 @@
+import os
import re
import glob
import html
import email
import logging
import datetime
+from multiprocessing import Pool, TimeoutError
from string import Template
import markdown
@@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$')
logging.basicConfig(encoding='utf-8', level=logging.INFO)
md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
+cpu_count = os.cpu_count()
def convert(text):
    md.reset()
@@ -52,11 +55,11 @@ def render_posts():
    files = glob.glob('posts/*.md')
    logging.info('found post files %s', files)
    posts = []
-    for fname in files:
-        p = render_post(fname)
-        posts.append(p)
-        logging.info('rendered post: %s', p)
+    logging.info('starting render posts with cpu_count: %d', cpu_count)
+    with Pool(processes=cpu_count) as pool:
+        posts = pool.map(render_post, files)
+    logging.info("render_posts result: %s", posts)
    return posts
def posts_list_html(posts):

View file

@@ -763,6 +763,7 @@ rm -f ./posts/*__bench*
```
```shell
# Run on a 16 core AMD Ryzen 7 7840U
./bench.sh 100
INFO: removing old __bench files
INFO: number of *.md files 102
@@ -790,6 +791,93 @@ So approx 0.8s per 100 posts which starts to get a bit painful in the thousands.
Will be a fun problem to try to solve in the future.
### 2024-06-19, gotta go fast?
The critical part of the program, the one that gets slower as the number of files grows, is the step where each markdown file is rendered to HTML.
I'm by no means a Python concurrency expert, but after a quick search `multiprocessing.Pool` looks like a really quick win here.
Luckily `render_posts()` is already in great shape for `Pool.map`:
* one list of input file names
* `render_post` is called with one file name as its argument
* the results are collected into a list
So here is the diff to make that happen:
```diff
diff --git a/main.py b/main.py
index 52ce57b..91d852b 100644
--- a/main.py
+++ b/main.py
@@ -1,9 +1,11 @@
+import os
import re
import glob
import html
import email
import logging
import datetime
+from multiprocessing import Pool, TimeoutError
from string import Template
import markdown
@@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$')
logging.basicConfig(encoding='utf-8', level=logging.INFO)
md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
+cpu_count = os.cpu_count()
def convert(text):
    md.reset()
@@ -52,11 +55,11 @@ def render_posts():
    files = glob.glob('posts/*.md')
    logging.info('found post files %s', files)
    posts = []
-    for fname in files:
-        p = render_post(fname)
-        posts.append(p)
-        logging.info('rendered post: %s', p)
+    logging.info('starting render posts with cpu_count: %d', cpu_count)
+    with Pool(processes=cpu_count) as pool:
+        posts = pool.map(render_post, files)
+    logging.info("render_posts result: %s", posts)
    return posts
def posts_list_html(posts):
```
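A couple of `multiprocessing` details worth keeping in mind (general library behavior, not something the diff above changes): `Pool.map` returns results in the same order as the input list, and the mapped function plus its arguments and return values have to be picklable, which is why a plain top-level function like `render_post` works here. A tiny self-contained sketch of the same pattern, with a made-up file list and a fake render step standing in for the real markdown conversion:
```python
import os
from multiprocessing import Pool


def render_post(fname):
    # Stand-in for the real markdown rendering; just builds a trivial HTML string.
    return f'<h1>{os.path.basename(fname)}</h1>'


if __name__ == '__main__':
    # The __main__ guard keeps spawn-based platforms (Windows/macOS) from
    # re-running this block when worker processes import the module.
    files = [f'posts/post_{i}.md' for i in range(8)]
    with Pool(processes=os.cpu_count()) as pool:
        # Each file name is handed to a worker; results come back in input order.
        posts = pool.map(render_post, files)
    print(posts)
```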
And re-run the benchmarks:
```shell
# Run on a 16 core AMD Ryzen 7 7840U
./bench.sh 100
INFO: removing old __bench files
INFO: number of *.md files 102
INFO: number of *.html files 2
INFO: running
real 0.21
user 1.64
sys 0.14
INFO: number of *.html files 102
INFO: cleanup __bench files
./bench.sh 1000
INFO: removing old __bench files
INFO: number of *.md files 1002
INFO: number of *.html files 2
INFO: running
real 1.12
user 15.69
sys 0.24
INFO: number of *.html files 1002
INFO: cleanup __bench files
```
So down to just over 1s for 1000 posts with 16 cores (`multiprocessing.Pool` uses the CPU core count by default if `processes` isn't passed). At roughly 0.8s per 100 posts, a serial run would have taken around 8s for 1000, so that's roughly a 7× speedup.
🎉 that's cool!
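Side note on that default: `Pool()` with no `processes` argument already starts `os.cpu_count()` workers, so the explicit `processes=cpu_count` in `main.py` is effectively just the default spelled out (the variable is also handy for the log line). A throwaway snippet, not part of `main.py`, to see it:
```python
import os
from multiprocessing import Pool

if __name__ == '__main__':
    print('cpu_count:', os.cpu_count())     # 16 on the machine benchmarked above
    # Pool() here behaves the same as Pool(processes=os.cpu_count()).
    with Pool() as pool:
        # abs is a picklable builtin, so it works fine as the mapped function.
        print(pool.map(abs, [-3, -2, -1]))  # -> [3, 2, 1]
```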
[1]: https://crystal-lang.org/
[2]: https://github.com/crystal-lang/crystal/releases/tag/0.31.0