|
|
@ -137,29 +137,11 @@ class Af_Readability extends Plugin {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public function extract_content($url) {
|
|
|
|
public function extract_content($url) {
|
|
|
|
if (!class_exists("Readability")) require_once(dirname(dirname(__DIR__)). "/lib/readability/Readability.php");
|
|
|
|
global $fetch_effective_url;
|
|
|
|
|
|
|
|
|
|
|
|
if (!defined('NO_CURL') && function_exists('curl_init') && !ini_get("open_basedir")) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$ch = curl_init($url);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
curl_setopt($ch, CURLOPT_TIMEOUT, 5);
|
|
|
|
|
|
|
|
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
|
|
|
|
|
|
|
|
curl_setopt($ch, CURLOPT_HEADER, true);
|
|
|
|
|
|
|
|
curl_setopt($ch, CURLOPT_NOBODY, true);
|
|
|
|
|
|
|
|
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
|
|
|
|
|
|
|
|
curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@curl_exec($ch);
|
|
|
|
|
|
|
|
$content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (strpos($content_type, "text/html") === FALSE)
|
|
|
|
if (!class_exists("Readability")) require_once(dirname(dirname(__DIR__)). "/lib/readability/Readability.php");
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$tmp = fetch_file_contents($url);
|
|
|
|
$tmp = fetch_file_contents(array("url" => $url, "type" => "text/html"));
|
|
|
|
|
|
|
|
|
|
|
|
if ($tmp && mb_strlen($tmp) < 1024 * 500) {
|
|
|
|
if ($tmp && mb_strlen($tmp) < 1024 * 500) {
|
|
|
|
$tmpdoc = new DOMDocument("1.0", "UTF-8");
|
|
|
|
$tmpdoc = new DOMDocument("1.0", "UTF-8");
|
|
|
@ -167,9 +149,6 @@ class Af_Readability extends Plugin {
|
|
|
|
if (!$tmpdoc->loadHTML('<?xml encoding="utf-8" ?>\n' . $tmp))
|
|
|
|
if (!$tmpdoc->loadHTML('<?xml encoding="utf-8" ?>\n' . $tmp))
|
|
|
|
return false;
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
if (!isset($effective_url))
|
|
|
|
|
|
|
|
$effective_url = $url;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (strtolower($tmpdoc->encoding) != 'utf-8') {
|
|
|
|
if (strtolower($tmpdoc->encoding) != 'utf-8') {
|
|
|
|
$tmpxpath = new DOMXPath($tmpdoc);
|
|
|
|
$tmpxpath = new DOMXPath($tmpdoc);
|
|
|
|
|
|
|
|
|
|
|
@ -180,7 +159,7 @@ class Af_Readability extends Plugin {
|
|
|
|
$tmp = $tmpdoc->saveHTML();
|
|
|
|
$tmp = $tmpdoc->saveHTML();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
$r = new Readability($tmp, $url);
|
|
|
|
$r = new Readability($tmp, $fetch_effective_url);
|
|
|
|
|
|
|
|
|
|
|
|
if ($r->init()) {
|
|
|
|
if ($r->init()) {
|
|
|
|
$tmpxpath = new DOMXPath($r->dom);
|
|
|
|
$tmpxpath = new DOMXPath($r->dom);
|
|
|
@ -190,13 +169,13 @@ class Af_Readability extends Plugin {
|
|
|
|
foreach ($entries as $entry) {
|
|
|
|
foreach ($entries as $entry) {
|
|
|
|
if ($entry->hasAttribute("href")) {
|
|
|
|
if ($entry->hasAttribute("href")) {
|
|
|
|
$entry->setAttribute("href",
|
|
|
|
$entry->setAttribute("href",
|
|
|
|
rewrite_relative_url($effective_url, $entry->getAttribute("href")));
|
|
|
|
rewrite_relative_url($fetch_effective_url, $entry->getAttribute("href")));
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if ($entry->hasAttribute("src")) {
|
|
|
|
if ($entry->hasAttribute("src")) {
|
|
|
|
$entry->setAttribute("src",
|
|
|
|
$entry->setAttribute("src",
|
|
|
|
rewrite_relative_url($effective_url, $entry->getAttribute("src")));
|
|
|
|
rewrite_relative_url($fetch_effective_url, $entry->getAttribute("src")));
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|