From f0266f51ab36bf389415bcac9fe26c8084c67bdd Mon Sep 17 00:00:00 2001
From: Christian Weiske
Date: Sun, 7 Nov 2010 15:45:50 +0100
Subject: [PATCH] add "extractfeedurls" rpc action that extracts the feed URLs from a HTML page

---
Review notes (not part of the commit message):
 * fix_url(): the slash check now starts after the "://" separator instead
   of at the fixed offset 7, and "feed:<full url>" is unwrapped correctly.
 * get_feeds_from_html(): only http(s) pages are fetched, load failures
   return an empty array, only RSS/Atom alternates are reported, and
   root-relative / protocol-relative hrefs are resolved correctly.
 * Whitespace was reconstructed; the index hashes below predate these edits.

 functions.php           | 95 ++++++++++++++++++++++++++++++++++++++---
 modules/backend-rpc.php | 10 ++++
 2 files changed, 98 insertions(+), 7 deletions(-)

diff --git a/functions.php b/functions.php
index ae37e7d84..2373d5435 100644
--- a/functions.php
+++ b/functions.php
@@ -2943,15 +2943,9 @@
 	function subscribe_to_feed($link, $url, $cat_id = 0,
 			$auth_login = '', $auth_pass = '') {
 
-		$url = fix_url($url);
-		$parts = parse_url($url);
-
+		$url = fix_url($url);
 		if (!validate_feed_url($url)) return 2;
 
-		if ($parts['scheme'] == 'feed') $parts['scheme'] = 'http';
-
-		$url = make_url_from_parts($parts);
-
 		if ($cat_id == "0" || !$cat_id) {
 			$cat_qpart = "NULL";
 		} else {
@@ -6674,6 +6668,8 @@
 
 	/**
 	 * Fixes incomplete URLs by prepending "http://".
+	 * Also replaces feed:// with http://, and
+	 * appends a trailing slash if the url is a domain name only.
 	 *
 	 * @param string $url Possibly incomplete URL
 	 *
@@ -6682,6 +6678,20 @@
 	function fix_url($url) {
 		if (strpos($url, '://') === false) {
 			$url = 'http://' . $url;
+		} else if (substr($url, 0, 5) == 'feed:') {
+			// "feed://host/path" -> "http://host/path",
+			// "feed:https://host/path" -> "https://host/path"
+			$rest = substr($url, 5);
+			$url = (substr($rest, 0, 2) == '//') ? 'http:' . $rest : $rest;
+		}
+
+		// append a slash if the URL has no path component;
+		// "http://www.example" -> "http://www.example/"
+		// Search after the "://" separator instead of a fixed offset so
+		// that schemes of any length ("https://", "ftp://") work.
+		$hostStart = strpos($url, '://') + 3;
+		if (strpos($url, '/', $hostStart) === false) {
+			$url .= '/';
 		}
 		return $url;
 	}
@@ -6973,4 +6983,75 @@
 		}
 		return false;
 	}
+
+	/**
+	 * Extracts RSS/Atom feed URLs from the given HTML URL.
+	 *
+	 * Only <link rel="alternate"> elements in the document head whose type
+	 * mentions "rss" or "atom" are reported, so that language or media
+	 * alternates are not mistaken for feeds.
+	 *
+	 * @param string $url HTML page URL
+	 *
+	 * @return array Array of feeds. Key is the full URL, value the title.
+	 *               Empty if the page could not be loaded.
+	 */
+	function get_feeds_from_html($url)
+	{
+		$url = fix_url($url);
+		//fix_url() guarantees a slash after the host name
+		$hostEnd = strpos($url, '/', strpos($url, '://') + 3);
+		$scheme  = substr($url, 0, strpos($url, ':'));
+		$rootUrl = substr($url, 0, $hostEnd);
+		$baseUrl = substr($url, 0, strrpos($url, '/') + 1);
+
+		//the URL comes from the request; only fetch web pages, never
+		//local files or other stream wrappers
+		if (!in_array(strtolower($scheme), array('http', 'https'))) {
+			return array();
+		}
+
+		//real-world HTML is rarely valid; collect parser errors
+		//internally instead of emitting them as PHP warnings
+		$oldSetting = libxml_use_internal_errors(true);
+		$doc        = new DOMDocument();
+		$loaded     = $doc->loadHTMLFile($url);
+		libxml_clear_errors();
+		libxml_use_internal_errors($oldSetting);
+		if (!$loaded) {
+			return array();
+		}
+
+		$xpath   = new DOMXPath($doc);
+		$entries = $xpath->query(
+			'/html/head/link[@rel="alternate" and '
+			. '(contains(@type, "rss") or contains(@type, "atom"))]'
+		);
+		$feedUrls = array();
+		foreach ($entries as $entry) {
+			$feedUrl = trim($entry->getAttribute('href'));
+			if ($feedUrl === '') {
+				continue;
+			}
+			$title = $entry->getAttribute('title');
+			if ($title == '') {
+				$title = $entry->getAttribute('type');
+			}
+			if (substr($feedUrl, 0, 2) == '//') {
+				//protocol-relative URL: reuse the page's scheme
+				$feedUrl = $scheme . ':' . $feedUrl;
+			} else if (strpos($feedUrl, '://') === false) {
+				if ($feedUrl[0] == '/') {
+					//absolute path: relative to the host root
+					$feedUrl = $rootUrl . $feedUrl;
+				} else {
+					//relative to the directory of the page
+					$feedUrl = $baseUrl . $feedUrl;
+				}
+			}
+			$feedUrls[$feedUrl] = $title;
+		}
+		return $feedUrls;
+	}
+
 ?>
diff --git a/modules/backend-rpc.php b/modules/backend-rpc.php
index 7ccb30692..54d636d18 100644
--- a/modules/backend-rpc.php
+++ b/modules/backend-rpc.php
@@ -123,6 +123,16 @@
 
 		}
 
+		if ($subop == "extractfeedurls") {
+			print "";
+
+			$urls = get_feeds_from_html($_REQUEST['url']);
+			print "";
+
+			print "";
+			return;
+		}
+
 		if ($subop == "togglepref") {
 			print "";
 