build_a_blog: multiprocessing
This commit is contained in:
parent
23042c2d35
commit
d2b810441d
2 changed files with 95 additions and 4 deletions
11
main.py
11
main.py
|
@ -1,9 +1,11 @@
|
|||
import os
|
||||
import re
|
||||
import glob
|
||||
import html
|
||||
import email
|
||||
import logging
|
||||
import datetime
|
||||
from multiprocessing import Pool, TimeoutError
|
||||
from string import Template
|
||||
|
||||
import markdown
|
||||
|
@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$')
|
|||
logging.basicConfig(encoding='utf-8', level=logging.INFO)
|
||||
|
||||
md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
|
||||
cpu_count = os.cpu_count()
|
||||
|
||||
def convert(text):
|
||||
md.reset()
|
||||
|
@ -52,11 +55,11 @@ def render_posts():
|
|||
files = glob.glob('posts/*.md')
|
||||
logging.info('found post files %s', files)
|
||||
posts = []
|
||||
for fname in files:
|
||||
p = render_post(fname)
|
||||
posts.append(p)
|
||||
logging.info('rendered post: %s', p)
|
||||
logging.info('starting render posts with cpu_count: %d', cpu_count)
|
||||
with Pool(processes=cpu_count) as pool:
|
||||
posts = pool.map(render_post, files)
|
||||
|
||||
logging.info("render_posts result: %s", res)
|
||||
return posts
|
||||
|
||||
def posts_list_html(posts):
|
||||
|
|
|
@ -763,6 +763,7 @@ rm -f ./posts/*__bench*
|
|||
```
|
||||
|
||||
```shell
|
||||
# Run on a 16 core AMD Ryzen 7 7840U
|
||||
❯ ./bench.sh 100
|
||||
INFO: removing old __bench files
|
||||
INFO: number of *.md files 102
|
||||
|
@ -790,6 +791,93 @@ So approx 0.8s per 100 posts which starts to get a bit painful in the thousands.
|
|||
|
||||
Will be a fun future idea to try to solve.
|
||||
|
||||
### 2024-06-19, gotta go fast?
|
||||
|
||||
The critical part of the program, the one that gets slower as more files are added, is rendering each markdown file to HTML.
|
||||
|
||||
I'm by no means a Python concurrency expert, but after a quick search `multiprocessing.Pool` looks like a really quick win here.
|
||||
|
||||
Luckily `render_posts()` is already in a great shape for using `Pool.map`:
|
||||
|
||||
* 1 array of input file names
|
||||
* Call `render_post` with 1 file name as an argument
|
||||
* Result is collected in a list.
|
||||
|
||||
So here is the diff to make that happen:
|
||||
|
||||
```diff
|
||||
diff --git a/main.py b/main.py
|
||||
index 52ce57b..91d852b 100644
|
||||
--- a/main.py
|
||||
+++ b/main.py
|
||||
@@ -1,9 +1,11 @@
|
||||
+import os
|
||||
import re
|
||||
import glob
|
||||
import html
|
||||
import email
|
||||
import logging
|
||||
import datetime
|
||||
+from multiprocessing import Pool, TimeoutError
|
||||
from string import Template
|
||||
|
||||
import markdown
|
||||
@@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$')
|
||||
logging.basicConfig(encoding='utf-8', level=logging.INFO)
|
||||
|
||||
md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
|
||||
+cpu_count = os.cpu_count()
|
||||
|
||||
def convert(text):
|
||||
md.reset()
|
||||
@@ -52,11 +55,11 @@ def render_posts():
|
||||
files = glob.glob('posts/*.md')
|
||||
logging.info('found post files %s', files)
|
||||
posts = []
|
||||
- for fname in files:
|
||||
- p = render_post(fname)
|
||||
- posts.append(p)
|
||||
- logging.info('rendered post: %s', p)
|
||||
+ logging.info('starting render posts with cpu_count: %d', cpu_count)
|
||||
+ with Pool(processes=cpu_count) as pool:
|
||||
+ posts = pool.map(render_post, files)
|
||||
|
||||
+ logging.info("render_posts result: %s", res)
|
||||
return posts
|
||||
|
||||
def posts_list_html(posts):
|
||||
```
|
||||
|
||||
And re-run the benchmarks:
|
||||
|
||||
```shell
|
||||
# Run on a 16 core AMD Ryzen 7 7840U
|
||||
❯ ./bench.sh 100
|
||||
INFO: removing old __bench files
|
||||
INFO: number of *.md files 102
|
||||
INFO: number of *.html files 2
|
||||
INFO: running
|
||||
real 0.21
|
||||
user 1.64
|
||||
sys 0.14
|
||||
INFO: number of *.html files 102
|
||||
INFO: cleanup __bench files
|
||||
|
||||
❯ ./bench.sh 1000
|
||||
INFO: removing old __bench files
|
||||
INFO: number of *.md files 1002
|
||||
INFO: number of *.html files 2
|
||||
INFO: running
|
||||
real 1.12
|
||||
user 15.69
|
||||
sys 0.24
|
||||
INFO: number of *.html files 1002
|
||||
INFO: cleanup __bench files
|
||||
```
|
||||
|
||||
So down to roughly 1.1s for 1000 posts with 16 cores (`multiprocessing.Pool` defaults to the CPU core count if `processes` is not passed).
|
||||
|
||||
🎉 that's cool!
|
||||
|
||||
[1]: https://crystal-lang.org/
|
||||
[2]: https://github.com/crystal-lang/crystal/releases/tag/0.31.0
|
||||
|
|
Loading…
Reference in a new issue