Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: HTML to Markdown lua filters #5054

Merged
merged 12 commits into from
Feb 7, 2025
29 changes: 19 additions & 10 deletions utils/grass_html2md.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ set -eu
# wget
#
# Author(s):
# Martin Landa, Markus Neteler
# Martin Landa, Markus Neteler, Corey White
#
# Usage:
# If you have "pandoc" in PATH, execute for HTML file conversion in
# current directory and subdirectories:
# ./utils/grass_html2md.sh
#
# COPYRIGHT: (C) 2024 by the GRASS Development Team
# COPYRIGHT: (C) 2024-2025 by the GRASS Development Team
#
# This program is free software under the GNU General Public
# License (>=v2). Read the file COPYING that comes with GRASS
Expand All @@ -43,6 +43,22 @@ trap "exitprocedure" 2 3 15
# path to LUA file (./utils/pandoc_codeblock.lua)
UTILSPATH="utils"

# Convert one pre-processed HTML file to Markdown, then delete that input file.
#   $1  temporary (pre-processed) HTML file; it is removed at the end
#   $2  original HTML file; the output is written next to it, with the
#       ".html" suffix replaced by ".md"
process_file() {
    local tmp_html="$1"    # temporary file
    local source_html="$2" # original file

    # 1. rewrite <div class="code"><pre> wrappers as <pre><code> blocks
    # 2. run pandoc (HTML -> GFM) with the project's Lua filter
    # 3. post-process the generated Markdown (" \$" -> " $", "%20" -> "-")
    sed -e 's#<div class="code"><pre>#<pre><code>#g' \
        -e 's#</pre></div>#</code></pre>#g' "$tmp_html" |
        pandoc -f html-native_divs \
            -t gfm+pipe_tables+gfm_auto_identifiers --wrap=auto \
            --lua-filter "${UTILSPATH}/pandoc_codeblock.lua" |
        sed -e 's+ \\\$+ \$+g' -e 's+%20+-+g' > "${source_html%%.html}.md"

    rm -f "$tmp_html"
}

# run recursively: HTML to MD
for f in $(find . -name *.html); do
echo "${f}"
Expand All @@ -57,13 +73,6 @@ for f in $(find . -name *.html); do
s|_KEEPHTML||g;
' "${f%%.html}.html" > "${f%%.html}_tmp.html"

cat "${f%%.html}_tmp.html" | \
sed 's#<div class="code"><pre>#<pre><code>#g' | \
sed 's#</pre></div>#</code></pre>#g' | \
pandoc --from=html --to=markdown -t gfm \
--lua-filter "${UTILSPATH}/pandoc_codeblock.lua" | \
sed 's+ \\\$+ \$+g' | sed 's+%20+-+g' > "${f%%.html}.md"

rm -f "${f%%.html}_tmp.html"
process_file "${f%%.html}_tmp.html" ${f%%.html}.html

done
70 changes: 67 additions & 3 deletions utils/pandoc_codeblock.lua
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,71 @@
-- Test cases
-- raster/r.sun/r.sun.html

-- Function to convert code blocks to markdown
function CodeBlock (cb)
return pandoc.RawBlock('markdown', '```shell\n' .. cb.text .. '\n```\n')
-- Enforces markdownlint rules during Pandoc conversion
-- Intended maximum line length for markdownlint rule MD013.
-- NOTE(review): not referenced by any filter function visible in this file.
local MAX_LINE_LENGTH = 120 -- Adjust as needed for MD013

-- NOTE(review): presumably a list-indentation prefix; it is empty and not
-- referenced by any filter function visible in this file -- confirm intent.
local LIST_INDENT = ""

--- Rebuild an image as a plain Markdown image: ![alt text](src).
-- Only the alt text and the source are carried over; the title and any
-- attributes of the incoming element are not passed to the new image.
-- @param el pandoc Image element
-- @return a new pandoc Image holding the caption (or a placeholder) and src
function Image(el)
    -- Pandoc stores an image's alt text in `caption` (a list of inlines).
    -- Image elements have no `alt` field, so reading `el.alt` always fell
    -- through to the placeholder and threw away the real description.
    local caption = el.caption
    if caption == nil or #caption == 0 then
        -- No alt text in the source: use the placeholder.
        caption = {pandoc.Str("image-alt")}
    end
    return pandoc.Image(caption, el.src)
end

-- Raw HTML fragments and the Markdown text that replaces them.
-- Rules are tried in order; the first fragment found in the raw text wins.
local RAW_HTML_REWRITES = {
    {"<em>", "*"},
    {"</em>", "*"},
    {"<i>", "*"},
    {"</i>", "*"},
    {"&nbsp;", " "},
    {"&lt;", "<"},
    {"&gt;", ">"},
}

--- Replace leftover raw HTML inlines with their Markdown equivalent.
-- Raw inlines in formats other than "html", and HTML fragments matching
-- none of the rules, are returned unchanged.
-- @param el pandoc RawInline element
-- @return a raw "markdown" inline for a recognised fragment, otherwise `el`
function RawInline(el)
    if el.format ~= "html" then
        return el
    end
    for _, rule in ipairs(RAW_HTML_REWRITES) do
        -- plain (non-pattern) substring search
        if el.text:find(rule[1], 1, true) then
            return pandoc.RawInline("markdown", rule[2])
        end
    end
    return el
end

--- Emit a code block as a backtick-fenced Markdown block.
-- The first class of the block is used as the language tag, falling back to
-- "sh" when the block has no classes.
-- @param el pandoc CodeBlock element
-- @return a raw "markdown" block holding the fenced code
function CodeBlock(el)
    local lang = el.classes[1] or "sh" -- Preserve language if available

    -- A fixed three-backtick fence is closed early by code that itself
    -- contains a run of three or more backticks. Make the fence one backtick
    -- longer than the longest run in the code (and never shorter than three).
    local longest_run = 0
    for run in el.text:gmatch("`+") do
        if #run > longest_run then
            longest_run = #run
        end
    end
    local fence = string.rep("`", math.max(3, longest_run + 1))

    return pandoc.RawBlock("markdown", fence .. lang .. "\n" .. el.text .. "\n" .. fence)
end

--- Rebuild a heading from its level and content only.
-- The original author's note says this is meant to ensure ATX-style
-- headers. No attributes of the incoming element are passed on.
-- @param el pandoc Header element
-- @return a new pandoc Header with the same level and content
function Header(el)
    local level, content = el.level, el.content
    return pandoc.Header(level, content)
end

--- Strip trailing whitespace from a text inline.
-- @param el pandoc Str element
-- @return a new pandoc Str whose text has no trailing whitespace
function Str(el)
    -- gsub also returns a match count; only the first result is kept
    local trimmed = string.gsub(el.text, "%s+$", "")
    return pandoc.Str(trimmed)
end

--- Collapse runs of empty paragraphs in the document.
-- Of several consecutive empty `Para` blocks only the first is kept; every
-- other block is copied through in its original order.
-- @param doc pandoc Pandoc document
-- @return a new Pandoc document with the filtered blocks and the original
--         metadata
function Pandoc(doc)
    local new_blocks = {}
    local previous_blank = false

    for _, block in ipairs(doc.blocks) do
        local is_blank = block.t == "Para" and #block.content == 0
        -- Drop an empty paragraph only when it directly follows another one.
        if not (is_blank and previous_blank) then
            new_blocks[#new_blocks + 1] = block
        end
        previous_blank = is_blank
    end

    -- Pass the metadata along: building the document from the blocks alone
    -- discarded `doc.meta`.
    return pandoc.Pandoc(new_blocks, doc.meta)
end
Loading