build_a_blog: safer markdown

2024-06-19 16:08:40 -04:00 · 2024-06-19 16:08:40 -04:00 · 906a5e04ea
commit 906a5e04ea
parent 3e06bfd9ae
2 changed files with 65 additions and 21 deletions
--- a/main.py
+++ b/main.py
@ -14,12 +14,12 @@ from markdown.extensions.toc import TocExtension
 destpath_re = re.compile(r'\.md$')
 logging.basicConfig(encoding='utf-8', level=logging.INFO)

-md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
 cpu_count = os.cpu_count()

 def convert(text):
-	md.reset()
-	return md.convert(text)
+	md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
+	res = md.convert(text)
+	return res, md.Meta

 def render_post(fpath):
 	destpath = destpath_re.sub('.html', fpath)
@ -30,15 +30,16 @@ def render_post(fpath):
 		text = input_file.read()

 	logging.info("parsing %s", fpath)
-	out = convert(text)
+	out, meta = convert(text)

-	title = md.Meta.get('title')[0]
-	date = md.Meta.get('date')[0]
+	title = meta.get('title')[0]
+	date = meta.get('date')[0]
 	draft = False
-	if md.Meta.get('draft'):
+	if meta.get('draft'):
 		draft = True

-	out = convert('# ' + title) + out
+	title_out, _ = convert(text)
+	out = title_out + out

 	logging.info("writing to %s", destpath)
 	render_template('index.html.tmpl', destpath, {'content': out, 'more_title': ' - ' + title})
@ -105,7 +106,7 @@ def rss_post_xml(post):
 		text = inf.read()


-	converted = convert(text)
+	converted, _ = convert(text)

 	pubdate = email.utils.format_datetime(datetime.datetime.fromisoformat(post['date']))
 	subs = {
--- a/posts/build_a_blog.md
+++ b/posts/build_a_blog.md
@ -812,7 +812,7 @@ So here is the diff to make that happen:

 ```diff
 diff --git a/main.py b/main.py
-index 52ce57b..95b650d 100644
+index 52ce57b..a3d92d1 100644
 --- a/main.py
 +++ b/main.py
@@ -1,9 +1,11 @@
@ -827,15 +827,45 @@ index 52ce57b..95b650d 100644
 from string import Template

 import markdown
-@@ -13,6 +15,7 @@ destpath_re = re.compile(r'\.md$')
+@@ -12,11 +14,12 @@ from markdown.extensions.toc import TocExtension
+ destpath_re = re.compile(r'\.md$')
 logging.basicConfig(encoding='utf-8', level=logging.INFO)

- md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
+-md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
 +cpu_count = os.cpu_count()

 def convert(text):
- 	md.reset()
-@@ -52,11 +55,11 @@ def render_posts():
+-	md.reset()
+-	return md.convert(text)
+	md = markdown.Markdown(extensions=['extra', 'meta', TocExtension(anchorlink=True)])
+	res = md.convert(text)
+	return res, md.Meta
+
+ def render_post(fpath):
+ 	destpath = destpath_re.sub('.html', fpath)
+@@ -27,15 +30,16 @@ def render_post(fpath):
+ 		text = input_file.read()
+
+ 	logging.info("parsing %s", fpath)
+-	out = convert(text)
+	out, meta = convert(text)
+
+-	title = md.Meta.get('title')[0]
+-	date = md.Meta.get('date')[0]
+	title = meta.get('title')[0]
+	date = meta.get('date')[0]
+ 	draft = False
+-	if md.Meta.get('draft'):
+	if meta.get('draft'):
+ 		draft = True
+
+-	out = convert('# ' + title) + out
+	title_out, _ = convert(text)
+	out = title_out + out
+
+ 	logging.info("writing to %s", destpath)
+ 	render_template('index.html.tmpl', destpath, {'content': out, 'more_title': ' - ' + title})
+@@ -52,11 +56,11 @@ def render_posts():
 	files = glob.glob('posts/*.md')
 	logging.info('found post files %s', files)
 	posts = []
@ -851,8 +881,21 @@ index 52ce57b..95b650d 100644
 	return posts

 def posts_list_html(posts):
+@@ -102,7 +106,7 @@ def rss_post_xml(post):
+ 		text = inf.read()
+
+
+-	converted = convert(text)
+	converted, _ = convert(text)
+
+ 	pubdate = email.utils.format_datetime(datetime.datetime.fromisoformat(post['date']))
+ 	subs = {
 ```

+The biggest note is that `convert()` now creates a `Markdown` instance on each call. This protects against multiple processes trying to use the same module-level `md`.
+
+See <https://python-markdown.github.io/reference/#Markdown> for notes on `Markdown.reset()` and thread safety.
+
 And re-run the benchmarks:

 ```shell
@ -862,9 +905,9 @@ INFO: removing old __bench files
 INFO: number of *.md files 102
 INFO: number of *.html files 2
 INFO: running
-real 0.27
-user 1.82
-sys 0.15
+real 0.45
+user 4.21
+sys 0.32
 INFO: number of *.html files 102
 INFO: cleanup __bench files

@ -873,14 +916,14 @@ INFO: removing old __bench files
 INFO: number of *.md files 1002
 INFO: number of *.html files 2
 INFO: running
-real 1.25
-user 16.68
-sys 0.52
+real 2.52
+user 35.69
+sys 0.96
 INFO: number of *.html files 1002
 INFO: cleanup __bench files
 ```

-Down to 1.25s for 1000 posts 🎉
+So that's down to 2.5s for 1000 posts. Not a bad start!

 [1]: https://crystal-lang.org/
 [2]: https://github.com/crystal-lang/crystal/releases/tag/0.31.0