Poor man's Reader App with Pandoc & Bash
Every so often, I want to avoid opening a website in a browser, for ... reasons.
Throwback, June 27, 2020.
Every so often, I want to avoid opening a website in a browser, for … reasons.
curl alone dumps far too much raw HTML, and I actually want to read the content.
Today, I was playing with Igor Chubin's awesome terminal services (wttr.in, cht.sh etc.), and it hit me:
"WAIT, there's pandoc, what if I just … "
—
… … … and an hour later… a terrible idea manifested itself.
# Convert an HTML document to Markdown and print it on stdout.
# Arguments:
#   $1 - input handed to pandoc (the post uses it with a site URL)
# Returns: pandoc's exit status.
www_to_md() {
  local source_url="${1}"
  # --wrap=none keeps each paragraph on one line instead of hard-wrapping it.
  local -a pandoc_opts=(--wrap=none -f html -t markdown)
  pandoc "${pandoc_opts[@]}" "${source_url}"
}
# Filter Markdown on stdin: strip leftover HTML noise and tidy blank lines.
# Outputs: the filtered text on stdout.
# Returns: the exit status of the final `cat -s` stage.
drop_noise() {
  # Drop every line that mentions an opening or closing div tag, plus any line
  # containing a '>' that is not the line's first character (raw HTML tags,
  # <autolinks>, "->" and the like). Lines whose only '>' sits in column one,
  # i.e. plain Markdown blockquotes, are kept.
  # The pattern deliberately avoids a leading '*' in any alternative: POSIX
  # leaves that undefined in EREs and newer GNU grep warns about it.
  grep -v -E '</?div|.>' |
    # squeeze multiple blank lines into one
    cat -s
}
# Copy stdin to stdout, saving a copy in the cache file when appropriate.
# Arguments:
#   $1 - cache directory (required; created if it does not exist)
#   $2 - cache file name inside $1 (default: this.md)
#   $3 - "no" (default) leaves an existing cache file untouched;
#        any other value overwrites it with the incoming data
# Outputs: stdin, unchanged, on stdout.
cache_site() {
  # Quote characters nested inside a double-quoted ${...} expansion are kept
  # literally, so the default file name and the error message are written
  # without inner quotes (otherwise the default file is named 'this.md',
  # quotes included).
  local sitecache="${1:?Fail. Path to create cache.}"
  local mdfilename="${2:-this.md}"
  local evict_cache_qmark="${3:-no}"
  local cachefile="${sitecache}/${mdfilename}"

  mkdir -p "${sitecache}"
  if [[ -f "${cachefile}" && "${evict_cache_qmark}" == "no" ]]; then
    # Cache already populated and no eviction requested: pass through only.
    tee
  else
    # First write, or eviction requested: pass through and (re)write the file.
    tee "${cachefile}"
  fi
}
# Print a web page as de-noised Markdown on stdout, caching the result on disk.
# Globals:
#   PANWWW_CACHE_DIR (read) - cache root directory; defaults to /tmp/panwwwcache
# Arguments:
#   $1 - URL of the page (required)
#   $2 - "no" (default) reuses an existing cache entry; any other value
#        (e.g. "refetch") fetches the page again and overwrites the entry
# Outputs: Markdown on stdout; a usage line on stderr when $1 is missing.
# Returns: 2 when no URL is given, otherwise the status of the pipeline.
panwww() {
  if [[ -z "${1:-}" ]]; then
    printf 'usage: panwww URL [refetch]\n' >&2
    return 2
  fi

  local siteurl="${1}"
  local evict_cache_qmark="${2:-no}"
  # Replace the first "http...://" match with "www." to form the cache key.
  local sitename="${siteurl/http*:\/\//www.}"
  local sitecache="${PANWWW_CACHE_DIR:-/tmp/panwwwcache}/${sitename}"
  local mdfilename="this.md"

  # Branch directly rather than storing the command in a string and expanding
  # it unquoted: that would word-split and glob URLs or cache paths containing
  # spaces, '?', '*' or '['.
  if [[ -f "${sitecache}/${mdfilename}" && "${evict_cache_qmark}" == "no" ]]; then
    cat -- "${sitecache}/${mdfilename}"
  else
    www_to_md "${siteurl}"
  fi |
    drop_noise |
    cache_site "${sitecache}" "${mdfilename}" "${evict_cache_qmark}"
}
so that …
panwww "https://www.recurse.com/" | less # if no cache file exists yet: fetches the site and stores the markdown
panwww "https://www.recurse.com/" | less # cache file exists now: reads the stored markdown instead of fetching
panwww "https://www.recurse.com/" "refetch" | less # any second argument other than "no" fetches again and overwrites the cache