build_a_blog: multiprocessing
This commit is contained in:
parent
23042c2d35
commit
d2b810441d
2 changed files with 95 additions and 4 deletions
11
main.py
11
main.py
|
@ -1,9 +1,11 @@
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import glob
|
import glob
|
||||||
import html
|
import html
|
||||||
import email
|
import email
|
||||||
import logging
|
import logging
|
||||||
import datetime
|
import datetime
|
||||||
|
from multiprocessing import Pool, TimeoutError
|
||||||
from string import Template
|
from string import Template
|
||||||
|
|
||||||
import markdown
|
import markdown
|
||||||
|
@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$')
|
||||||
logging.basicConfig(encoding='utf-8', level=logging.INFO)
|
logging.basicConfig(encoding='utf-8', level=logging.INFO)
|
||||||
|
|
||||||
md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
|
md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
|
||||||
|
cpu_count = os.cpu_count()
|
||||||
|
|
||||||
def convert(text):
|
def convert(text):
|
||||||
md.reset()
|
md.reset()
|
||||||
|
@ -52,11 +55,11 @@ def render_posts():
|
||||||
files = glob.glob('posts/*.md')
|
files = glob.glob('posts/*.md')
|
||||||
logging.info('found post files %s', files)
|
logging.info('found post files %s', files)
|
||||||
posts = []
|
posts = []
|
||||||
for fname in files:
|
logging.info('starting render posts with cpu_count: %d', cpu_count)
|
||||||
p = render_post(fname)
|
with Pool(processes=cpu_count) as pool:
|
||||||
posts.append(p)
|
posts = pool.map(render_post, files)
|
||||||
logging.info('rendered post: %s', p)
|
|
||||||
|
|
||||||
|
logging.info("render_posts result: %s", res)
|
||||||
return posts
|
return posts
|
||||||
|
|
||||||
def posts_list_html(posts):
|
def posts_list_html(posts):
|
||||||
|
|
|
@ -763,6 +763,7 @@ rm -f ./posts/*__bench*
|
||||||
```
|
```
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
|
# Run on a 16 core AMD Ryzen 7 7840U
|
||||||
❯ ./bench.sh 100
|
❯ ./bench.sh 100
|
||||||
INFO: removing old __bench files
|
INFO: removing old __bench files
|
||||||
INFO: number of *.md files 102
|
INFO: number of *.md files 102
|
||||||
|
@ -790,6 +791,93 @@ So approx 0.8s per 100 posts which starts to get a bit painful in the thousands.
|
||||||
|
|
||||||
Will be a fun future idea to try to solve.
|
Will be a fun future idea to try to solve.
|
||||||
|
|
||||||
|
### 2024-06-19, gotta go fast?
|
||||||
|
|
||||||
|
The critical part of the program that gets slower with more files is where each Markdown file is rendered to HTML.
|
||||||
|
|
||||||
|
I'm by no means a Python concurrency expert, but after a quick search `multiprocessing.Pool` looks like a really quick win here.
|
||||||
|
|
||||||
|
Luckily `render_posts()` is already in a great format for using `Pool.map`:
|
||||||
|
|
||||||
|
* 1 array of input file names
|
||||||
|
* Call `render_post` with 1 file name as an argument
|
||||||
|
* Result is collected in a list.
|
||||||
|
|
||||||
|
So here is the diff to make that happen:
|
||||||
|
|
||||||
|
```diff
|
||||||
|
diff --git a/main.py b/main.py
|
||||||
|
index 52ce57b..91d852b 100644
|
||||||
|
--- a/main.py
|
||||||
|
+++ b/main.py
|
||||||
|
@@ -1,9 +1,11 @@
|
||||||
|
+import os
|
||||||
|
import re
|
||||||
|
import glob
|
||||||
|
import html
|
||||||
|
import email
|
||||||
|
import logging
|
||||||
|
import datetime
|
||||||
|
+from multiprocessing import Pool, TimeoutError
|
||||||
|
from string import Template
|
||||||
|
|
||||||
|
import markdown
|
||||||
|
@@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$')
|
||||||
|
logging.basicConfig(encoding='utf-8', level=logging.INFO)
|
||||||
|
|
||||||
|
md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
|
||||||
|
+cpu_count = os.cpu_count()
|
||||||
|
|
||||||
|
def convert(text):
|
||||||
|
md.reset()
|
||||||
|
@@ -52,11 +55,11 @@ def render_posts():
|
||||||
|
files = glob.glob('posts/*.md')
|
||||||
|
logging.info('found post files %s', files)
|
||||||
|
posts = []
|
||||||
|
- for fname in files:
|
||||||
|
- p = render_post(fname)
|
||||||
|
- posts.append(p)
|
||||||
|
- logging.info('rendered post: %s', p)
|
||||||
|
+ logging.info('starting render posts with cpu_count: %d', cpu_count)
|
||||||
|
+ with Pool(processes=cpu_count) as pool:
|
||||||
|
+ posts = pool.map(render_post, files)
|
||||||
|
|
||||||
|
+ logging.info("render_posts result: %s", res)
|
||||||
|
return posts
|
||||||
|
|
||||||
|
def posts_list_html(posts):
|
||||||
|
```
|
||||||
|
|
||||||
|
And re-run the benchmarks:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
# Run on a 16 core AMD Ryzen 7 7840U
|
||||||
|
❯ ./bench.sh 100
|
||||||
|
INFO: removing old __bench files
|
||||||
|
INFO: number of *.md files 102
|
||||||
|
INFO: number of *.html files 2
|
||||||
|
INFO: running
|
||||||
|
real 0.21
|
||||||
|
user 1.64
|
||||||
|
sys 0.14
|
||||||
|
INFO: number of *.html files 102
|
||||||
|
INFO: cleanup __bench files
|
||||||
|
|
||||||
|
❯ ./bench.sh 1000
|
||||||
|
INFO: removing old __bench files
|
||||||
|
INFO: number of *.md files 1002
|
||||||
|
INFO: number of *.html files 2
|
||||||
|
INFO: running
|
||||||
|
real 1.12
|
||||||
|
user 15.69
|
||||||
|
sys 0.24
|
||||||
|
INFO: number of *.html files 1002
|
||||||
|
INFO: cleanup __bench files
|
||||||
|
```
|
||||||
|
|
||||||
|
So down to ~1.1s for 1000 posts with 16 cores (`multiprocessing.Pool` uses the CPU core count by default if `processes` is not passed).
|
||||||
|
|
||||||
|
🎉 that's cool!
|
||||||
|
|
||||||
[1]: https://crystal-lang.org/
|
[1]: https://crystal-lang.org/
|
||||||
[2]: https://github.com/crystal-lang/crystal/releases/tag/0.31.0
|
[2]: https://github.com/crystal-lang/crystal/releases/tag/0.31.0
|
||||||
|
|
Loading…
Reference in a new issue