/url updated to actually work again.

This commit is contained in:
Storm Dragon
2025-07-28 17:48:04 -04:00
parent 2a409645de
commit 5197af9eba
+166 -63
View File
@@ -176,7 +176,9 @@ BEGIN {
# lists= # Comma-separated list of lists to follow
# === URL HANDLING ===
# urlopen=echo %U # Command to open URLs (%U = URL)
# urlopen=echo %U # Command to open URLs (%U = URL) - deprecated, use cli_browser/gui_browser
# cli_browser= # CLI browser (w3m, elinks, lynx) - auto-detected if empty
# gui_browser= # GUI browser (xdg-open, firefox, etc) - auto-detected if empty
# shoreblogurl=http://is.gd/api.php?longurl= # URL shortening service
# === SYSTEM SETTINGS ===
@@ -265,7 +267,7 @@ EOF
rlurl noprompt shoreblogurl newline wrap verify autosplit
notimeline queryurl fediverseserver colourprompt colourme
colourdm colourreply colourwarn coloursearch colourlist idurl
urlopen delurl notrack dmdelurl favsurl
urlopen cli_browser gui_browser delurl notrack dmdelurl favsurl
favurl favdelurl slowpost notifies filter colourdefault
followurl leaveurl dmupdate mentions backload
lat long location searchhits blockurl blockdelurl woeid
@@ -1080,6 +1082,8 @@ $pause = (($anonymous) ? 120 : "auto") if (!defined $pause);
$superverbose ||= 0;
$avatar ||= "";
$urlopen ||= 'echo %U';
$cli_browser ||= '';
$gui_browser ||= '';
$hold ||= 0;
$daemon ||= 0;
$maxhist ||= 19;
@@ -3362,70 +3366,113 @@ EOF
$genurl = $idurl;
}
# to be TOS-compliant, we must try entities first to use
# t.co wrapped links. this is a tiny version of /entities.
unless ($notco) {
my $id = $post->{'reblog'}->{'id_str'}
|| $post->{'id_str'};
my $hash;
# only fetch if we have to. if we already fetched
# because we were given a direct id_str instead of a
# menu code, then we already have the entities.
if ($code !~ /^[0-9]+$/) {
$hash = &grabjson("${genurl}?id=${id}",
0, 0, 0, undef, 1);
} else {
# MAKE MONEY FAST WITH OUR QUICK CACHE PLAN
$hash = $post;
# Extract URLs from Mastodon post content
my $didprint = 0;
# First, try to get the FULL post data from the API to access structured URL fields
my $full_post = undef;
if (defined($genurl) && defined($post->{'id_str'})) {
my $post_id = $post->{'id_str'};
# Replace %I placeholder with actual post ID
my $api_url = $genurl;
$api_url =~ s/%I/$post_id/g;
print STDERR "-- DEBUG: Fetching full post data from API for URL extraction: $api_url\n" if ($superverbose);
$full_post = &grabjson($api_url, 0, 0, 0, undef, 1);
}
# Use full post data if available, otherwise fall back to cached data
my $working_post = (defined($full_post) && ref($full_post) eq 'HASH') ? $full_post : $post;
# DEBUG: Show what we have in the post structure
print STDERR "-- DEBUG: URL parsing - using " . (defined($full_post) ? "FULL" : "CACHED") . " post data\n" if ($superverbose);
print STDERR "-- DEBUG: URL parsing - post keys: " . join(", ", keys %$working_post) . "\n" if ($superverbose);
print STDERR "-- DEBUG: URL parsing - content field: '" . ($working_post->{'content'} || 'UNDEFINED') . "'\n" if ($superverbose);
print STDERR "-- DEBUG: URL parsing - text field: '" . ($working_post->{'text'} || 'UNDEFINED') . "'\n" if ($superverbose);
# PRIORITY 1: Extract URLs from Mastodon's structured URL data (card/preview_url)
if (defined($working_post->{'card'}) && ref($working_post->{'card'}) eq 'HASH') {
my $card = $working_post->{'card'};
if (defined($card->{'url'}) && length($card->{'url'})) {
print STDERR "-- DEBUG: Found card URL: " . $card->{'url'} . "\n" if ($superverbose);
&openurl($card->{'url'});
$didprint++;
}
if (defined($hash) && ref($hash) eq 'HASH') {
my $w;
my $v;
my $didprint = 0;
# fediverse puts entities in multiple fields.
foreach $w (qw(media urls)) {
my $p = $hash->{'entities'}->{$w};
next if (!defined($p) ||
ref($p) ne 'ARRAY');
foreach $v (@{ $p }) {
next if (!defined($v) ||
ref($v) ne 'HASH');
next if (!length($v->{'url'}) ||
(!length($v->{'expanded_url'}) &&
!length($v->{'media_url'})));
my $u1 = &descape($v->{'url'});
&openurl($u1);
$didprint++;
}
}
# PRIORITY 2: Extract URLs from preview_url field (some servers use this)
if (!$didprint && defined($working_post->{'preview_url'}) && length($working_post->{'preview_url'})) {
print STDERR "-- DEBUG: Found preview_url: " . $working_post->{'preview_url'} . "\n" if ($superverbose);
&openurl($working_post->{'preview_url'});
$didprint++;
}
# PRIORITY 3: Media attachments
if (!$didprint && defined($working_post->{'media_attachments'}) &&
ref($working_post->{'media_attachments'}) eq 'ARRAY') {
foreach my $media (@{ $working_post->{'media_attachments'} }) {
if (defined($media->{'url'}) && length($media->{'url'})) {
print STDERR "-- DEBUG: Found media URL: " . $media->{'url'} . "\n" if ($superverbose);
&openurl($media->{'url'});
$didprint++;
}
print $stdout
"-- sorry, couldn't find any URL.\n"
if (!$didprint);
return 0;
}
print $stdout
"-- unable to use t.co URLs, using fallback\n";
}
# that failed, so fall back on the old method.
my $text = &descape($post->{'text'});
# findallurls
while ($text
=~ s#(h?ttp|h?ttps|ftp|gopher)://([a-zA-Z0-9_~/:%\-\+\.\=\&\?\#,]+)##){
# sigh. I HATE YOU TINYARRO.WS
#TODO
# eventually we will have to put a punycode implementation into openurl
# to handle things like Mac OS X's open which don't understand UTF-8 URLs.
# when we do, uncomment this again
# =~ s#(http|https|ftp|gopher)://([^'\\]+?)('|\\|\s|$)##) {
my $url = $1 . "://$2";
$url = "h$url" if ($url =~ /^ttps?:/);
$url =~ s/[\.\?]$//;
&openurl($url);
# PRIORITY 4: Parse URLs from HTML content (href attributes)
if (!$didprint) {
my $content = $working_post->{'content'} || $working_post->{'text'} || '';
if (length($content)) {
# Extract URLs from href attributes in HTML
while ($content =~ s/<a[^>]+href=["']([^"']+)["'][^>]*>[^<]*<\/a>//i) {
my $url = $1;
next if ($url =~ /^#/); # Skip hashtag links
next if ($url =~ /^\@/); # Skip mention links
print STDERR "-- DEBUG: Found HTML href URL: " . $url . "\n" if ($superverbose);
&openurl($url);
$didprint++;
}
}
}
if ($didprint) {
return 0;
}
# PRIORITY 5: Final fallback - parse plain text URLs from display text
# This handles truncated URLs in display text as a last resort
if (!$didprint) {
# Re-get content since we may have modified it above with regex substitutions
my $original_content = $working_post->{'content'} || $working_post->{'text'} || '';
my $plain_content = &html_to_text($original_content);
$plain_content = &descape($plain_content);
print STDERR "-- DEBUG: Parsing plain text content: '$plain_content'\n" if ($superverbose);
# findallurls - extract any remaining URLs from plain text
# First try URLs with protocols
while ($plain_content
=~ s#(h?ttp|h?ttps|ftp|gopher)://([a-zA-Z0-9_~/:%\-\+\.\=\&\?\#,]+)##){
my $url = $1 . "://$2";
$url = "h$url" if ($url =~ /^ttps?:/);
$url =~ s/[\.\?]$//;
print STDERR "-- DEBUG: Found plain text URL with protocol: " . $url . "\n" if ($superverbose);
&openurl($url);
$didprint++;
}
# Then try URLs without protocols (assume https) - WARNING: may be truncated
while ($plain_content
=~ s#\b([a-zA-Z0-9\-]+\.[a-zA-Z]{2,}(?:/[a-zA-Z0-9_~/%:\-\+\.\=\&\?\#,]*)?)\b##){
my $url = "https://$1";
$url =~ s/[\.\?]$//;
print STDERR "-- DEBUG: Found plain text URL without protocol (may be truncated): " . $url . "\n" if ($superverbose);
print $stdout "-- WARNING: URL may be truncated from display text: $url\n";
&openurl($url);
$didprint++;
}
}
print $stdout "-- sorry, couldn't find any URL.\n"
if (!defined($urlshort));
if (!$didprint);
return 0;
}
@@ -7932,15 +7979,71 @@ sub generate_shortdomain {
if (!length($shoreblogurldomain));
}
sub detect_browser {
my $is_gui = defined($ENV{'DISPLAY'}) && length($ENV{'DISPLAY'});
my $browser;
if ($is_gui) {
# GUI environment - check gui_browser setting first
if (length($gui_browser)) {
return ($gui_browser, 1); # Return browser and background flag
}
# Try xdg-open first (most reliable on Linux)
if (system("which xdg-open >/dev/null 2>&1") == 0) {
return ('xdg-open %U', 1);
}
# Fallback GUI browsers
my @gui_browsers = qw(brave chromium firefox google-chrome);
foreach my $br (@gui_browsers) {
if (system("which $br >/dev/null 2>&1") == 0) {
return ("$br %U", 1);
}
}
# If no GUI browser found, fall back to CLI
print STDERR "-- warning: no GUI browser found, falling back to CLI browser\n";
}
# CLI environment or fallback - check cli_browser setting first
if (length($cli_browser)) {
return ($cli_browser, 0); # CLI browsers are blocking
}
# Try CLI browsers in order of preference
my @cli_browsers = qw(w3m elinks lynx);
foreach my $br (@cli_browsers) {
if (system("which $br >/dev/null 2>&1") == 0) {
return ("$br %U", 0);
}
}
# Ultimate fallback - use the old urlopen setting
return ($urlopen, 0);
}
sub openurl {
my $comm = $urlopen;
my $url = shift;
my ($comm, $should_background) = &detect_browser();
# Handle gopher URLs through gateway if not using lynx
$url = "http://gopher.floodgap.com/gopher/gw?".&url_oauth_sub($url)
if ($url =~ m#^gopher://# && $comm !~ /^[^\s]*lynx/);
$urlshort = $url;
$comm =~ s/\%U/'$url'/g;
print $stdout "($comm)\n";
system("$comm");
if ($should_background) {
# Background GUI browsers
$comm .= " &";
print $stdout "($comm)\n";
system("$comm");
} else {
# CLI browsers run in foreground
print $stdout "($comm)\n";
system("$comm");
}
}
sub urlshorten {