Use pandoc to remove HTML from titles.
This commit is contained in:
parent
43f93d0a90
commit
89a78fef32
@ -2,16 +2,9 @@
|
||||
|
||||
# NOTE(review): this span is a rendered diff hunk whose +/- markers were lost.
# The hunk header (old: 16 lines, new: 9 lines) implies that the lines from the
# first pageTitle assignment through the first msg call are the removed version,
# and the second pageTitle assignment through the second msg call are the added
# version - confirm against the repository before treating this as runnable code.
#
# Loop over each word of $3 ($3 is unquoted, so it is split on whitespace and
# subject to globbing) and report the title of every word that looks like a URL.
for l in $3 ; do
|
||||
# The regex is unanchored: any word containing "http://", "https://" or "www."
# anywhere in it is treated as a link.
if [[ "${l}" =~ http://|https://|www\..* ]]; then
|
||||
# (old version) Fetch the page quietly with a 5 second connect timeout. The sed
# script collects the whole input in the hold space, keeps only what is inside
# <head>...</head>, stops if that substitution did not match (T), and then prints
# only the text between <title> and </title>.
pageTitle="$(curl -s --connect-timeout 5 "$l" | sed -n -e 'H;${x;s!.*<head[^>]*>\(.*\)</head>.*!\1!;T;s!.*<title>\(.*\)</title>.*!\1!p}')"
|
||||
|
||||
# Fix up pageTitle a bit.
|
||||
# NOTE(review): the pattern on the next line may have been altered by the page
# rendering (e.g. an HTML entity shown decoded) - verify against the repository.
pageTitle="${pageTitle//'/\'}"
|
||||
# Replace every en dash with a plain hyphen.
pageTitle="${pageTitle//–/-}"
|
||||
|
||||
# Reduce the URL to its host part: drop everything up to and including "://",
# then drop everything from the first "/" onwards.
shortLink="${l#*://}"
|
||||
shortLink="${shortLink%%/*}"
|
||||
# Only report titles longer than one character.
if [ ${#pageTitle} -gt 1 ]; then
|
||||
msg "$2" "$pageTitle at $shortLink"
|
||||
# (new version) Same title extraction, but curl now follows redirects (-L), the
# extracted title is passed through pandoc with plain-text output, and tr turns
# every whitespace character (newlines included) into a single space.
# NOTE(review): if pandoc ends its output with a newline, tr converts it to a
# trailing space that the command substitution will not strip - confirm.
pageTitle="$(curl -L -s --connect-timeout 5 "$l" | sed -n -e 'H;${x;s!.*<head[^>]*>\(.*\)</head>.*!\1!;T;s!.*<title>\(.*\)</title>.*!\1!p}' | pandoc -t plain | tr '[:space:]' ' ')"
|
||||
# Only report titles longer than one character.
if [[ ${#pageTitle} -gt 1 ]]; then
|
||||
msg "$2" "$pageTitle"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
Loading…
Reference in New Issue
Block a user