From 22a866edb5ea406bbd30ca777b58099ce9f55d1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20Kapp=C3=A9?= Date: Sun, 12 Aug 2018 14:54:13 +0100 Subject: [PATCH 1/3] Store language of entries as indicated by the feed. --- classes/feeditem.php | 1 + classes/feeditem/atom.php | 9 +++++++++ classes/feeditem/rss.php | 12 +++++++++++- classes/feedparser.php | 2 +- classes/rssutils.php | 5 +++-- 5 files changed, 25 insertions(+), 4 deletions(-) diff --git a/classes/feeditem.php b/classes/feeditem.php index 594f56984..3a5e5dc09 100644 --- a/classes/feeditem.php +++ b/classes/feeditem.php @@ -11,5 +11,6 @@ abstract class FeedItem { abstract function get_categories(); abstract function get_enclosures(); abstract function get_author(); + abstract function get_language(); } diff --git a/classes/feeditem/atom.php b/classes/feeditem/atom.php index 77cd448b8..6e7a904f8 100644 --- a/classes/feeditem/atom.php +++ b/classes/feeditem/atom.php @@ -197,4 +197,13 @@ class FeedItem_Atom extends FeedItem_Common { return $encs; } + function get_language() { + $elem = $this->elem; + do { + $lang = $elem->getAttributeNS("http://www.w3.org/XML/1998/namespace", "lang"); + $elem = $elem->parentNode; + } while (empty($lang) && $elem instanceof DOMElement); + + return $lang; + } } diff --git a/classes/feeditem/rss.php b/classes/feeditem/rss.php index a3fa7e636..dca125be6 100644 --- a/classes/feeditem/rss.php +++ b/classes/feeditem/rss.php @@ -189,4 +189,14 @@ class FeedItem_RSS extends FeedItem_Common { return $encs; } -} \ No newline at end of file + function get_language() { + $languages = $this->doc->getElementsByTagName('language'); + + if (count($languages) == 0) { + return ""; + } + + return $languages[0]->textContent; + } + +} diff --git a/classes/feedparser.php b/classes/feedparser.php index 860ebd73f..a5e406149 100644 --- a/classes/feedparser.php +++ b/classes/feedparser.php @@ -283,4 +283,4 @@ class FeedParser { return $rv; } -} \ No newline at end of file +} diff --git 
a/classes/rssutils.php b/classes/rssutils.php index b69bb25a0..6fa1e9f4f 100755 --- a/classes/rssutils.php +++ b/classes/rssutils.php @@ -637,8 +637,11 @@ class RSSUtils { $entry_link = rewrite_relative_url($site_url, $item->get_link()); + $entry_language = $item->get_language(); + _debug("title $entry_title", $debug_enabled); _debug("link $entry_link", $debug_enabled); + _debug("language $entry_language", $debug_enabled); if (!$entry_title) $entry_title = date("Y-m-d H:i:s", $entry_timestamp);; @@ -694,7 +697,6 @@ class RSSUtils { $base_entry_id = $row["id"]; $entry_stored_hash = $row["content_hash"]; $article_labels = Article::get_article_labels($base_entry_id, $owner_uid); - $entry_language = $row["lang"]; $existing_tags = Article::get_article_tags($base_entry_id, $owner_uid); $entry_tags = array_unique(array_merge($entry_tags, $existing_tags)); @@ -702,7 +704,6 @@ class RSSUtils { $base_entry_id = false; $entry_stored_hash = ""; $article_labels = array(); - $entry_language = ""; } $article = array("owner_uid" => $owner_uid, // read only From 3bbaf902abe70e28645dcd1bba20705e8db4676b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20Kapp=C3=A9?= Date: Sun, 12 Aug 2018 16:12:34 +0100 Subject: [PATCH 2/3] Sanitize language obtained for an entry. 
--- classes/rssutils.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classes/rssutils.php b/classes/rssutils.php index 6fa1e9f4f..af5fd057c 100755 --- a/classes/rssutils.php +++ b/classes/rssutils.php @@ -637,7 +637,7 @@ class RSSUtils { $entry_link = rewrite_relative_url($site_url, $item->get_link()); - $entry_language = $item->get_language(); + $entry_language = mb_substr(trim($item->get_language()), 0, 2); _debug("title $entry_title", $debug_enabled); _debug("link $entry_link", $debug_enabled); From a377d5c981471f86c0d0c121be799aa0868e220e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20Kapp=C3=A9?= Date: Sun, 12 Aug 2018 16:17:13 +0100 Subject: [PATCH 3/3] Determine language for atom entry without a loop. --- classes/feeditem/atom.php | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/classes/feeditem/atom.php b/classes/feeditem/atom.php index 6e7a904f8..ee5591757 100644 --- a/classes/feeditem/atom.php +++ b/classes/feeditem/atom.php @@ -1,5 +1,6 @@ <?php class FeedItem_Atom extends FeedItem_Common { + const NS_XML = "http://www.w3.org/XML/1998/namespace"; function get_id() { $id = $this->elem->getElementsByTagName("id")->item(0); @@ -198,12 +199,13 @@ class FeedItem_Atom extends FeedItem_Common { } function get_language() { - $elem = $this->elem; - do { - $lang = $elem->getAttributeNS("http://www.w3.org/XML/1998/namespace", "lang"); - $elem = $elem->parentNode; - } while (empty($lang) && $elem instanceof DOMElement); + $lang = $this->elem->getAttributeNS(self::NS_XML, "lang"); - return $lang; + if (!empty($lang)) { + return $lang; + } else { + // Fall back to the language declared on the feed, if any. + return $this->doc->firstChild->getAttributeNS(self::NS_XML, "lang"); + } } }