/url updated to actually work again.

2025-07-28 17:48:04 -04:00
parent 2a409645de
commit 5197af9eba
1 changed files with 166 additions and 63 deletions
@@ -176,7 +176,9 @@ BEGIN {
 # lists=                                 # Comma-separated list of lists to follow
 # === URL HANDLING ===
-# urlopen=echo %U                        # Command to open URLs (%U = URL)
+# urlopen=echo %U                        # Command to open URLs (%U = URL) - deprecated, use cli_browser/gui_browser
 # cli_browser=                           # CLI browser (w3m, elinks, lynx) - auto-detected if empty
 # gui_browser=                           # GUI browser (xdg-open, firefox, etc) - auto-detected if empty
 # shoreblogurl=http://is.gd/api.php?longurl=  # URL shortening service
 # === SYSTEM SETTINGS ===
@@ -265,7 +267,7 @@ EOF
 		rlurl noprompt shoreblogurl newline wrap verify autosplit
 		notimeline queryurl fediverseserver colourprompt colourme
 		colourdm colourreply colourwarn coloursearch colourlist idurl
-		urlopen delurl notrack dmdelurl favsurl
+		urlopen cli_browser gui_browser delurl notrack dmdelurl favsurl
 		favurl favdelurl slowpost notifies filter colourdefault
 		followurl leaveurl dmupdate mentions backload
 		lat long location searchhits blockurl blockdelurl woeid
@@ -1080,6 +1082,8 @@ $pause = (($anonymous) ? 120 : "auto") if (!defined $pause);
 # Give each setting a default when it is still unset (or otherwise false).
 $superverbose ||= 0;
 $avatar ||= "";
 # %U in the command template is replaced with the URL by openurl.
 $urlopen ||= 'echo %U';
 # An empty cli_browser/gui_browser means "not configured": detect_browser
 # checks their length and otherwise probes for an installed browser.
 $cli_browser ||= '';
 $gui_browser ||= '';
 $hold ||= 0;
 $daemon ||= 0;
 $maxhist ||= 19;
@@ -3362,70 +3366,113 @@ EOF
 			$genurl = $idurl;
 		} 
-		# to be TOS-compliant, we must try entities first to use
+		# Extract URLs from Mastodon post content
-		# t.co wrapped links. this is a tiny version of /entities.
+		my $didprint = 0;
-		unless ($notco) {
+		
-			my $id = $post->{'reblog'}->{'id_str'}
+		# First, try to get the FULL post data from the API to access structured URL fields
-				|| $post->{'id_str'};
+		my $full_post = undef;
-			my $hash;
+		if (defined($genurl) && defined($post->{'id_str'})) {
-
+			my $post_id = $post->{'id_str'};
-			# only fetch if we have to. if we already fetched
+			# Replace %I placeholder with actual post ID
-			# because we were given a direct id_str instead of a
+			my $api_url = $genurl;
-			# menu code, then we already have the entities.
+			$api_url =~ s/%I/$post_id/g;
-			if ($code !~ /^[0-9]+$/) {
+			print STDERR "-- DEBUG: Fetching full post data from API for URL extraction: $api_url\n" if ($superverbose);
-				$hash = &grabjson("${genurl}?id=${id}",
+			$full_post = &grabjson($api_url, 0, 0, 0, undef, 1);
-					0, 0, 0, undef, 1);
+		}
-			} else {
+		
-				# MAKE MONEY FAST WITH OUR QUICK CACHE PLAN
+		# Use full post data if available, otherwise fall back to cached data
-				$hash = $post;
+		my $working_post = (defined($full_post) && ref($full_post) eq 'HASH') ? $full_post : $post;
 		# DEBUG: Show what we have in the post structure
 		print STDERR "-- DEBUG: URL parsing - using " . (defined($full_post) ? "FULL" : "CACHED") . " post data\n" if ($superverbose);
 		print STDERR "-- DEBUG: URL parsing - post keys: " . join(", ", keys %$working_post) . "\n" if ($superverbose);
 		print STDERR "-- DEBUG: URL parsing - content field: '" . ($working_post->{'content'} || 'UNDEFINED') . "'\n" if ($superverbose);
 		print STDERR "-- DEBUG: URL parsing - text field: '" . ($working_post->{'text'} || 'UNDEFINED') . "'\n" if ($superverbose);
 		# PRIORITY 1: Extract URLs from Mastodon's structured URL data (card/preview_url)
 		if (defined($working_post->{'card'}) && ref($working_post->{'card'}) eq 'HASH') {
 			my $card = $working_post->{'card'};
 			if (defined($card->{'url'}) && length($card->{'url'})) {
 				print STDERR "-- DEBUG: Found card URL: " . $card->{'url'} . "\n" if ($superverbose);
 				&openurl($card->{'url'});
 				$didprint++;
 			}
-			if (defined($hash) && ref($hash) eq 'HASH') {
+		}
-				my $w;
+		
-				my $v;
+		# PRIORITY 2: Extract URLs from preview_url field (some servers use this)
-				my $didprint = 0;
+		if (!$didprint && defined($working_post->{'preview_url'}) && length($working_post->{'preview_url'})) {
-
+			print STDERR "-- DEBUG: Found preview_url: " . $working_post->{'preview_url'} . "\n" if ($superverbose);
-				# fediverse puts entities in multiple fields.
+			&openurl($working_post->{'preview_url'});
-				foreach $w (qw(media urls)) {
+			$didprint++;
-					my $p = $hash->{'entities'}->{$w};
+		}
-					next if (!defined($p) ||
+		
-						ref($p) ne 'ARRAY');
+		# PRIORITY 3: Media attachments
-					foreach $v (@{ $p }) {
+		if (!$didprint && defined($working_post->{'media_attachments'}) && 
-						next if (!defined($v) ||
+		    ref($working_post->{'media_attachments'}) eq 'ARRAY') {
-							ref($v) ne 'HASH');
+			foreach my $media (@{ $working_post->{'media_attachments'} }) {
-						next if (!length($v->{'url'}) ||
+				if (defined($media->{'url'}) && length($media->{'url'})) {
-							(!length($v->{'expanded_url'}) &&
+					print STDERR "-- DEBUG: Found media URL: " . $media->{'url'} . "\n" if ($superverbose);
-					 		!length($v->{'media_url'})));
+					&openurl($media->{'url'});
-						my $u1 = &descape($v->{'url'});
+					$didprint++;
 						&openurl($u1);
 						$didprint++;
 					}
 				}
 				print $stdout
 				"-- sorry, couldn't find any URL.\n"
 					if (!$didprint);
 				return 0;
 			}
 			print $stdout
 				"-- unable to use t.co URLs, using fallback\n";
 		}
-		# that failed, so fall back on the old method.
+		
-		my $text = &descape($post->{'text'});
+		# PRIORITY 4: Parse URLs from HTML content (href attributes)
-		# findallurls
+		if (!$didprint) {
-		while ($text
+			my $content = $working_post->{'content'} || $working_post->{'text'} || '';
-	=~ s#(h?ttp|h?ttps|ftp|gopher)://([a-zA-Z0-9_~/:%\-\+\.\=\&\?\#,]+)##){
+			if (length($content)) {
-# sigh. I HATE YOU TINYARRO.WS
+				# Extract URLs from href attributes in HTML
-#TODO
+				while ($content =~ s/<a[^>]+href=["']([^"']+)["'][^>]*>[^<]*<\/a>//i) {
-# eventually we will have to put a punycode implementation into openurl
+					my $url = $1;
-# to handle things like Mac OS X's open which don't understand UTF-8 URLs.
+					next if ($url =~ /^#/); # Skip hashtag links
-# when we do, uncomment this again
+					next if ($url =~ /^\@/); # Skip mention links
-#	=~ s#(http|https|ftp|gopher)://([^'\\]+?)('|\\|\s|$)##) {
+					print STDERR "-- DEBUG: Found HTML href URL: " . $url . "\n" if ($superverbose);
-			my $url = $1 . "://$2";
+					&openurl($url);
-			$url = "h$url" if ($url =~ /^ttps?:/);
+					$didprint++;
-			$url =~ s/[\.\?]$//;
+				}
-			&openurl($url);
+			}
 		}
 		if ($didprint) {
 			return 0;
 		}
 		# PRIORITY 5: Final fallback - parse plain text URLs from display text
 		# This handles truncated URLs in display text as a last resort
 		if (!$didprint) {
 			# Re-get content since we may have modified it above with regex substitutions
 			my $original_content = $working_post->{'content'} || $working_post->{'text'} || '';
 			my $plain_content = &html_to_text($original_content);
 			$plain_content = &descape($plain_content);
 			print STDERR "-- DEBUG: Parsing plain text content: '$plain_content'\n" if ($superverbose);
 			# findallurls - extract any remaining URLs from plain text
 			# First try URLs with protocols
 			while ($plain_content
 		=~ s#(h?ttp|h?ttps|ftp|gopher)://([a-zA-Z0-9_~/:%\-\+\.\=\&\?\#,]+)##){
 				my $url = $1 . "://$2";
 				$url = "h$url" if ($url =~ /^ttps?:/);
 				$url =~ s/[\.\?]$//;
 				print STDERR "-- DEBUG: Found plain text URL with protocol: " . $url . "\n" if ($superverbose);
 				&openurl($url);
 				$didprint++;
 			}
 			# Then try URLs without protocols (assume https) - WARNING: may be truncated
 			while ($plain_content
 		=~ s#\b([a-zA-Z0-9\-]+\.[a-zA-Z]{2,}(?:/[a-zA-Z0-9_~/%:\-\+\.\=\&\?\#,]*)?)\b##){
 				my $url = "https://$1";
 				$url =~ s/[\.\?]$//;
 				print STDERR "-- DEBUG: Found plain text URL without protocol (may be truncated): " . $url . "\n" if ($superverbose);
 				print $stdout "-- WARNING: URL may be truncated from display text: $url\n";
 				&openurl($url);
 				$didprint++;
 			}
 		}
 		print $stdout "-- sorry, couldn't find any URL.\n"
-			if (!defined($urlshort));
+			if (!$didprint);
 		return 0;
 	}
@@ -7932,15 +7979,71 @@ sub generate_shortdomain {
 		if (!length($shoreblogurldomain));
 }
 # detect_browser()
 #
 # Pick the command template used to open a URL.
 #
 # Returns a two-element list: (command template, background flag).
 # The template contains a %U placeholder that openurl replaces with the
 # URL. The flag is 1 for GUI browsers (openurl runs them with "&") and
 # 0 for CLI browsers, which run in the foreground.
 #
 # Selection order:
 #   1. If $ENV{DISPLAY} is set and non-empty: $gui_browser, then xdg-open,
 #      then the first of brave/chromium/firefox/google-chrome on the PATH.
 #   2. Otherwise, or if no GUI browser was found: $cli_browser, then the
 #      first of w3m/elinks/lynx on the PATH.
 #   3. Finally the legacy $urlopen setting, returned unchanged.
 sub detect_browser {
 	# The sample configuration documents cli_browser/gui_browser as bare
 	# program names ("w3m", "firefox"). Without a %U the substitution in
 	# openurl would be a no-op and the browser would start without the
 	# URL, so append the placeholder when the user did not supply one.
 	my $with_placeholder = sub {
 		my $cmd = shift;
 		return ($cmd =~ /\%U/) ? $cmd : "$cmd %U";
 	};
 	# True when the named program can be found via which(1). Only called
 	# with the fixed names listed below, never with user input.
 	my $is_installed = sub {
 		my $prog = shift;
 		return system("which $prog >/dev/null 2>&1") == 0;
 	};

 	my $is_gui = defined($ENV{'DISPLAY'}) && length($ENV{'DISPLAY'});

 	if ($is_gui) {
 		# GUI environment - an explicit gui_browser setting wins.
 		if (defined($gui_browser) && length($gui_browser)) {
 			return ($with_placeholder->($gui_browser), 1);
 		}

 		# xdg-open first (most reliable on Linux), then known browsers.
 		foreach my $br (qw(xdg-open brave chromium firefox google-chrome)) {
 			return ("$br %U", 1) if ($is_installed->($br));
 		}

 		# If no GUI browser found, fall back to CLI
 		print STDERR "-- warning: no GUI browser found, falling back to CLI browser\n";
 	}

 	# CLI environment or fallback - an explicit cli_browser setting wins.
 	if (defined($cli_browser) && length($cli_browser)) {
 		return ($with_placeholder->($cli_browser), 0);  # CLI browsers are blocking
 	}

 	# Try CLI browsers in order of preference
 	foreach my $br (qw(w3m elinks lynx)) {
 		return ("$br %U", 0) if ($is_installed->($br));
 	}

 	# Ultimate fallback - use the old urlopen setting as the user wrote it
 	return ($urlopen, 0);
 }
 # openurl($url)
 #
 # Open $url with the command chosen by detect_browser.
 #
 # The command template's %U placeholders are replaced with the
 # single-quoted URL, the resulting command line is echoed to $stdout in
 # parentheses, and the command is run through the shell. GUI browsers
 # (background flag set by detect_browser) are started with a trailing "&"
 # so they do not block; CLI browsers run in the foreground.
 #
 # Side effect: stores the URL actually opened in the global $urlshort.
 # Returns whatever system() returned for the command.
 sub openurl {
 	my $url = shift;
 	my ($comm, $should_background) = &detect_browser();

 	# Handle gopher URLs through gateway if not using lynx
 	$url = "http://gopher.floodgap.com/gopher/gw?".&url_oauth_sub($url)
 		if ($url =~ m#^gopher://# && $comm !~ /^[^\s]*lynx/);
 	$urlshort = $url;

 	# The URL comes from remote post data and is placed inside single
 	# quotes on a shell command line. An embedded ' would end the quoting
 	# and let the rest of the URL run as shell commands, so rewrite each
 	# ' as '\'' (close quote, escaped quote, reopen quote).
 	(my $quoted_url = $url) =~ s/'/'\\''/g;
 	$comm =~ s/\%U/'$quoted_url'/g;

 	# Background GUI browsers; CLI browsers run in foreground
 	$comm .= " &" if ($should_background);

 	print $stdout "($comm)\n";
 	system("$comm");
 }
 sub urlshorten {