build_a_blog: multiprocessing

This commit is contained in:
Collin Lefeber 2024-06-19 15:26:09 -04:00
parent 23042c2d35
commit d2b810441d
2 changed files with 95 additions and 4 deletions

11
main.py
View file

@ -1,9 +1,11 @@
import os
import re import re
import glob import glob
import html import html
import email import email
import logging import logging
import datetime import datetime
from multiprocessing import Pool, TimeoutError
from string import Template from string import Template
import markdown import markdown
@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$')
logging.basicConfig(encoding='utf-8', level=logging.INFO) logging.basicConfig(encoding='utf-8', level=logging.INFO)
md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)]) md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
cpu_count = os.cpu_count()
def convert(text): def convert(text):
md.reset() md.reset()
@ -52,11 +55,11 @@ def render_posts():
files = glob.glob('posts/*.md') files = glob.glob('posts/*.md')
logging.info('found post files %s', files) logging.info('found post files %s', files)
posts = [] posts = []
for fname in files: logging.info('starting render posts with cpu_count: %d', cpu_count)
p = render_post(fname) with Pool(processes=cpu_count) as pool:
posts.append(p) posts = pool.map(render_post, files)
logging.info('rendered post: %s', p)
logging.info("render_posts result: %s", res)
return posts return posts
def posts_list_html(posts): def posts_list_html(posts):

View file

@ -763,6 +763,7 @@ rm -f ./posts/*__bench*
``` ```
```shell ```shell
# Run on a 16 core AMD Ryzen 7 7840U
./bench.sh 100 ./bench.sh 100
INFO: removing old __bench files INFO: removing old __bench files
INFO: number of *.md files 102 INFO: number of *.md files 102
@ -790,6 +791,93 @@ So approx 0.8s per 100 posts which starts to get a bit painful in the thousands.
Will be a fun future idea to try to solve. Will be a fun future idea to try to solve.
### 2024-06-19, gotta go fast?
The critical part of the program that gets slower with more files is when each markdown file is rendered to HTML.
I'm by no means a Python concurrency expert, but after a quick search `multiprocessing.Pool` looks like a really quick win here.
Luckily `render_posts()` is already in a great format for using `Pool.map`:
* 1 array of input file names
* Call `render_post` with 1 file name as an argument
* Result is collected in a list.
So here is the diff to make that happen:
```diff
diff --git a/main.py b/main.py
index 52ce57b..91d852b 100644
--- a/main.py
+++ b/main.py
@@ -1,9 +1,11 @@
+import os
import re
import glob
import html
import email
import logging
import datetime
+from multiprocessing import Pool, TimeoutError
from string import Template
import markdown
@@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$')
logging.basicConfig(encoding='utf-8', level=logging.INFO)
md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
+cpu_count = os.cpu_count()
def convert(text):
md.reset()
@@ -52,11 +55,11 @@ def render_posts():
files = glob.glob('posts/*.md')
logging.info('found post files %s', files)
posts = []
- for fname in files:
- p = render_post(fname)
- posts.append(p)
- logging.info('rendered post: %s', p)
+ logging.info('starting render posts with cpu_count: %d', cpu_count)
+ with Pool(processes=cpu_count) as pool:
+ posts = pool.map(render_post, files)
+ logging.info("render_posts result: %s", res)
return posts
def posts_list_html(posts):
```
And re-run the benchmarks:
```shell
# Run on a 16 core AMD Ryzen 7 7840U
./bench.sh 100
INFO: removing old __bench files
INFO: number of *.md files 102
INFO: number of *.html files 2
INFO: running
real 0.21
user 1.64
sys 0.14
INFO: number of *.html files 102
INFO: cleanup __bench files
./bench.sh 1000
INFO: removing old __bench files
INFO: number of *.md files 1002
INFO: number of *.html files 2
INFO: running
real 1.12
user 15.69
sys 0.24
INFO: number of *.html files 1002
INFO: cleanup __bench files
```
So down to about 1.1s for 1000 posts with 16 cores (`multiprocessing.Pool` defaults to the CPU core count when `processes` is not passed).
🎉 that's cool!
[1]: https://crystal-lang.org/ [1]: https://crystal-lang.org/
[2]: https://github.com/crystal-lang/crystal/releases/tag/0.31.0 [2]: https://github.com/crystal-lang/crystal/releases/tag/0.31.0