Use pandoc to remove HTML from titles.

This commit is contained in:
Storm Dragon 2020-08-10 09:35:15 -04:00
parent 43f93d0a90
commit 89a78fef32

View File

@@ -2,16 +2,9 @@
# For every whitespace-separated word in $3 that looks like a URL, fetch the
# page, pull the text of its <title> element and pass it to `msg` along with $2.
# `msg` is defined outside this excerpt; presumably $2 is the message target
# and $3 the incoming text -- TODO confirm against the caller.
#
# NOTE(review): this excerpt appears to be a diff hunk shown without +/-
# markers, so two variants of the fetch/report code are interleaved below.
# As written, `if [ ${#pageTitle} -gt 1 ]; then` has no matching `fi`.
#
# $3 is deliberately unquoted so the shell splits it into words (this also
# subjects each word to pathname expansion).
for l in $3 ; do
# Unanchored regex: matches when the word contains "http://", "https://" or
# "www." anywhere in it.
if [[ "${l}" =~ http://|https://|www\..* ]]; then
# First variant (looks like the pre-change code -- confirm): no redirect
# following. curl runs silently with a 5 second connect timeout. The sed
# program appends every line to the hold space (H); on the last line ($) it
# swaps the hold space in (x), reduces it to the contents of <head>...</head>,
# jumps to the end if that substitution did not match (T), then reduces it to
# the text between <title> and </title> and prints it.
pageTitle="$(curl -s --connect-timeout 5 "$l" | sed -n -e 'H;${x;s!.*<head[^>]*>\(.*\)</head>.*!\1!;T;s!.*<title>\(.*\)</title>.*!\1!p}')"
# Fix up pageTitle a bit.
# Replace the HTML entity &#039; with an apostrophe and &#8211; with a hyphen.
pageTitle="${pageTitle//&#039;/\'}"
pageTitle="${pageTitle//&#8211;/-}"
# Reduce the URL to its host part: drop everything up to and including "://",
# then drop everything from the first "/" onwards.
shortLink="${l#*://}"
shortLink="${shortLink%%/*}"
# Only report titles longer than one character.
if [ ${#pageTitle} -gt 1 ]; then
msg "$2" "$pageTitle at $shortLink"
# Second variant (looks like the post-change code -- confirm): same fetch and
# <title> extraction, but curl follows redirects (-L), the extracted title is
# piped through `pandoc -t plain` (per the commit message, to remove HTML from
# titles), and `tr` turns every whitespace character into a plain space.
pageTitle="$(curl -L -s --connect-timeout 5 "$l" | sed -n -e 'H;${x;s!.*<head[^>]*>\(.*\)</head>.*!\1!;T;s!.*<title>\(.*\)</title>.*!\1!p}' | pandoc -t plain | tr '[:space:]' ' ')"
# Only report titles longer than one character; this variant sends the title
# alone, without the " at <host>" suffix.
if [[ ${#pageTitle} -gt 1 ]]; then
msg "$2" "$pageTitle"
fi
fi
done