Skip to content

Commit

Permalink
docs: script to convert HTML manual pages to markdown (#4620)
Browse files Browse the repository at this point in the history
  • Loading branch information
neteler authored Feb 5, 2025
1 parent 344096a commit 560e6d2
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 0 deletions.
69 changes: 69 additions & 0 deletions utils/grass_html2md.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash
# Abort on the first failing command (errexit) and treat the expansion of an
# unset variable as an error (nounset).
set -o errexit -o nounset

###############################################################################
# Convert recursively all .html files to .md (GitHub flavoured Markdown)
#
# Dependencies:
# pandoc
# wget
#
# Author(s):
# Martin Landa, Markus Neteler
#
# Usage:
# With "pandoc" in PATH, run the following to convert all HTML files in
# the current directory and its subdirectories:
# ./utils/grass_html2md.sh
#
# COPYRIGHT: (C) 2024 by the GRASS Development Team
#
# This program is free software under the GNU General Public
# License (>=v2). Read the file COPYING that comes with GRASS
# for details.
#
###############################################################################

# cleanup at user break
#
# Removes the temporary "<name>_tmp.html" file that belongs to the HTML file
# currently held in the global variable f.
# Globals:   f (read) - path of the HTML file being processed; may be unset
# Returns:   0 when there is nothing to remove, otherwise the status of rm
cleanup()
{
  # ${f:-} keeps `set -u` from aborting with "unbound variable" when the
  # handler runs before f has been assigned.
  local current="${f:-}"

  # Nothing is being processed: do not touch a stray "_tmp.html" in the
  # current directory.
  [ -n "${current}" ] || return 0

  rm -f -- "${current%%.html}_tmp.html"
}

# Handler run on user break: announce the interruption on stdout, let
# cleanup() remove temporary files, then terminate with exit status 1.
exitprocedure()
{
  printf '%s\n' "User break!"
  cleanup
  exit 1
}
# Install exitprocedure as the handler for SIGINT (2), SIGQUIT (3) and
# SIGTERM (15); `trap -l` prints the full signal list.
trap exitprocedure INT QUIT TERM

# Absolute path of the directory holding the Lua filter
# (pandoc_codeblock.lua). It is derived from the location of this script,
# next to which the filter is expected to live, so that the filter is found
# regardless of the directory the script is started from.
UTILSPATH="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# run recursively: HTML to MD

# Make a pipeline fail when any of its stages fails (e.g. pandoc), instead of
# reporting only the exit status of the final sed.
set -o pipefail

# Convert one HTML page to GitHub flavoured Markdown.
# Globals:   UTILSPATH (read) - directory containing pandoc_codeblock.lua
# Arguments: $1 - path of the .html file; the result is written next to it
#                 with the .html suffix replaced by .md
# Returns:   non-zero if any stage of the conversion pipeline fails
convert_html_to_md()
{
  local src="$1"
  local dst="${src%.html}.md"

  # Stage 1 rewrites .html to .md in relative links only, stage 2 turns the
  # <div class="code"><pre> wrappers into <pre><code>, stage 3 is the pandoc
  # conversion, stage 4 replaces " \$" with " $" and "%20" with "-".
  # Everything is streamed, so no intermediate file is left behind.
  sed -E '
    # Step 1: Preserve http/https links with .html (and optional anchors)
    s|(<a href="https?://[^"]+\.html)(#[^"]*)?">|\1_KEEPHTML\2">|g;
    # Step 2: Replace .html with .md for local links (with or without anchors)
    s|(<a href=")([^"]+)\.html(#[^"]*)?">|\1\2.md\3">|g;
    # Step 3: Restore preserved http/https links with .html
    s|_KEEPHTML||g;
  ' "${src}" \
    | sed -e 's#<div class="code"><pre>#<pre><code>#g' \
          -e 's#</pre></div>#</code></pre>#g' \
    | pandoc --from=html --to=markdown -t gfm \
        --lua-filter "${UTILSPATH}/pandoc_codeblock.lua" \
    | sed -e 's+ \\\$+ \$+g' -e 's+%20+-+g' > "${dst}"
}

# NUL-delimited find output keeps paths containing whitespace intact, and the
# quoted '*.html' stops the shell from expanding the pattern before find sees
# it. The list is read on file descriptor 3 so that commands inside the loop
# cannot consume it through stdin. f stays a global variable because the
# cleanup handler reads it.
while IFS= read -r -d '' -u 3 f; do
  printf '%s\n' "${f}"
  convert_html_to_md "${f}"
done 3< <(find . -name '*.html' -print0)
8 changes: 8 additions & 0 deletions utils/pandoc_codeblock.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-- Pandoc Lua filter to handle code blocks
-- Test cases
-- raster/r.sun/r.sun.html

-- Function to convert code blocks to markdown
--
-- Replaces a pandoc CodeBlock element with a raw markdown block containing
-- the block text inside a fenced code block tagged "shell".
-- @param cb  the CodeBlock element; only cb.text is read
-- @return    a pandoc RawBlock of format 'markdown'
function CodeBlock (cb)
  -- The fence must be longer than any run of backticks inside the text,
  -- otherwise such a run would close the fenced block early. Three backticks
  -- is the minimum fence length.
  local longest = 0
  for run in cb.text:gmatch('`+') do
    if #run > longest then
      longest = #run
    end
  end
  local fence = string.rep('`', math.max(3, longest + 1))

  return pandoc.RawBlock('markdown',
                         fence .. 'shell\n' .. cb.text .. '\n' .. fence .. '\n')
end

0 comments on commit 560e6d2

Please sign in to comment.