From 2a479dced03735c9e6062bf0366e2774ca253300 Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Tue, 21 Aug 2007 15:15:50 +0100 Subject: [PATCH] rework feed content mangling algorithm --- magpierss/rss_fetch.inc | 30 +--------------------- magpierss/rss_parse.inc | 57 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 30 deletions(-) diff --git a/magpierss/rss_fetch.inc b/magpierss/rss_fetch.inc index 77a1a704f..126dc630a 100644 --- a/magpierss/rss_fetch.inc +++ b/magpierss/rss_fetch.inc @@ -279,33 +279,6 @@ function _fetch_remote_file ($url, $headers = "" ) { } -function _convert_entities ($string) { - # Source: http://www.w3.org/TR/REC-html40/sgml/entities.html - $html_entities = array( - " ", "¡", "¢", "£", "¤", "¥", "¦", "§", "¨", "©", - "ª", "«", "¬", "­", "®", "¯", "°", "±", "²", "³", - "´", "µ", "¶", "·", "¸", "¹", "º", "»", "¼", "½", - "¾", "¿", "À", "Á", "Â", "Ã", "Ä", "Å", "Æ", "Ç", - "È", "É", "Ê", "Ë", "Ì", "Í", "Î", "Ï", "Ð", "Ñ", - "Ò", "Ó", "Ô", "Õ", "Ö", "×", "Ø", "Ù", "Ú", "Û", - "Ü", "Ý", "Þ", "ß", "à", "á", "â", "ã", "ä", "å", - "æ", "ç", "è", "é", "ê", "ë", "ì", "í", "î", "ï", - "ð", "ñ", "ò", "ó", "ô", "õ", "ö", "÷", "ø", "ù", - "ú", "û", "ü", "ý", "þ", "ÿ",); - $numeric_entities = array( - " ", "¡", "¢", "£", "¤", "¥", "¦", "§", "¨", "©", - "ª", "«", "¬", "­", "®", "¯", "°", "±", "²", "³", - "´", "µ", "¶", "·", "¸", "¹", "º", "»", "¼", "½", - "¾", "¿", "À", "Á", "Â", "Ã", "Ä", "Å", "Æ", "Ç", - "È", "É", "Ê", "Ë", "Ì", "Í", "Î", "Ï", "Ð", "Ñ", - "Ò", "Ó", "Ô", "Õ", "Ö", "×", "Ø", "Ù", "Ú", "Û", - "Ü", "Ý", "Þ", "ß", "à", "á", "â", "ã", "ä", "å", - "æ", "ç", "è", "é", "ê", "ë", "ì", "í", "î", "ï", - "ð", "ñ", "ò", "ó", "ô", "õ", "ö", "÷", "ø", "ù", - "ú", "û", "ü", "ý", "þ", "ÿ"); - return str_replace($html_entities, $numeric_entities, $string); -} - /*=======================================================================*\ Function: _response_to_rss Purpose: parse an HTTP response object into an RSS object @@ -313,8 +286,7 @@ function _convert_entities ($string) { Output: parsed RSS object (see rss_parse) \*=======================================================================*/ function _response_to_rss ($resp) { - $converted_source = _convert_entities($resp->results); - $rss = new MagpieRSS( $converted_source, MAGPIE_OUTPUT_ENCODING, "UTF-8", false); + $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, "UTF-8", false); // if RSS parsed successfully if ( $rss and !$rss->ERROR) { diff --git a/magpierss/rss_parse.inc b/magpierss/rss_parse.inc index 66e5e65f2..3aff57a50 100644 --- a/magpierss/rss_parse.inc +++ b/magpierss/rss_parse.inc @@ -23,6 +23,35 @@ define('RSS', 'RSS'); define('ATOM', 'Atom'); +function _convert_entities ($string) { + # Source: http://www.w3.org/TR/REC-html40/sgml/entities.html + $html_entities = array( + " ", "¡", "¢", "£", "¤", "¥", "¦", "§", "¨", "©", + "ª", "«", "¬", "­", "®", "¯", "°", "±", "²", "³", + "´", "µ", "¶", "·", "¸", "¹", "º", "»", "¼", "½", + "¾", "¿", "À", "Á", "Â", "Ã", "Ä", "Å", "Æ", "Ç", + "È", "É", "Ê", "Ë", "Ì", "Í", "Î", "Ï", "Ð", "Ñ", + "Ò", "Ó", "Ô", "Õ", "Ö", "×", "Ø", "Ù", "Ú", "Û", + "Ü", "Ý", "Þ", "ß", "à", "á", "â", "ã", "ä", "å", + "æ", "ç", "è", "é", "ê", "ë", "ì", "í", "î", "ï", + "ð", "ñ", "ò", "ó", "ô", "õ", "ö", "÷", "ø", "ù", + "ú", "û", "ü", "ý", "þ", "ÿ",); + $numeric_entities = array( + " ", "¡", "¢", "£", "¤", "¥", "¦", "§", "¨", "©", + "ª", "«", "¬", "­", "®", "¯", "°", "±", "²", "³", + "´", "µ", "¶", "·", "¸", "¹", "º", "»", "¼", "½", + "¾", "¿", "À", "Á", "Â", "Ã", "Ä", "Å", "Æ", "Ç", + "È", "É", "Ê", "Ë", "Ì", "Í", "Î", "Ï", "Ð", "Ñ", + "Ò", "Ó", "Ô", "Õ", "Ö", "×", "Ø", "Ù", "Ú", "Û", + "Ü", "Ý", "Þ", "ß", "à", "á", "â", "ã", "ä", "å", + "æ", "ç", "è", "é", "ê", "ë", "ì", "í", "î", "ï", + "ð", "ñ", "ò", "ó", "ô", "õ", "ö", "÷", "ø", "ù", + "ú", "û", "ü", "ý", "þ", "ÿ"); + return str_replace($html_entities, $numeric_entities, $string); +} + + + require_once (MAGPIE_DIR . 'rss_utils.inc'); /** @@ -149,12 +178,14 @@ class MagpieRSS { $enc = mb_detect_encoding($string); } + # try fix XML, pass 1 + $source = mb_convert_encoding($source, "UTF-8", $enc); list($parser, $source) = $this->create_parser($source, $output_encoding, $input_encoding, $detect_encoding); - $this->parser = $parser; + $this->parser = $parser; xml_set_object( $this->parser, $this ); xml_set_element_handler($this->parser, @@ -163,6 +194,30 @@ class MagpieRSS { xml_set_character_data_handler( $this->parser, 'feed_cdata' ); $status = xml_parse( $this->parser, $source); + + # try to fix XML, pass 2 + + if (! $status) { + $errorcode = xml_get_error_code( $this->parser ); + if ( $errorcode != XML_ERROR_NONE ) { + + $source = _convert_entities($source); + + list($parser, $source) = $this->create_parser($source, + $output_encoding, $input_encoding, $detect_encoding); + + $this->parser = $parser; + + xml_set_object( $this->parser, $this ); + xml_set_element_handler($this->parser, + 'feed_start_element', 'feed_end_element' ); + + xml_set_character_data_handler( $this->parser, 'feed_cdata' ); + + $status = xml_parse( $this->parser, $source); + + } + } } }