From 63d6e6dfc35e6d82c4a64f37c408794c163becd4 Mon Sep 17 00:00:00 2001 From: thomascube <thomas@roundcube.net> Date: Wed, 28 Sep 2011 15:16:41 -0400 Subject: [PATCH] Bump versions to 0.6 stable --- program/lib/html2text.php | 257 +++++++++++++++++++++++++++++++++++--------------- 1 files changed, 178 insertions(+), 79 deletions(-) diff --git a/program/lib/html2text.php b/program/lib/html2text.php index e2a5b24..1ab1605 100644 --- a/program/lib/html2text.php +++ b/program/lib/html2text.php @@ -148,7 +148,6 @@ '/[ ]{2,}/', // Runs of spaces, pre-handling '/<script[^>]*>.*?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with '/<style[^>]*>.*?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with - //'/<!-- .* -->/', // Comments -- which strip_tags might have problem a with '/<p[^>]*>/i', // <P> '/<br[^>]*>/i', // <br> '/<i[^>]*>(.*?)<\/i>/i', // <i> @@ -158,6 +157,7 @@ '/<li[^>]*>(.*?)<\/li>/i', // <li> and </li> '/<li[^>]*>/i', // <li> '/<hr[^>]*>/i', // <hr> + '/<div[^>]*>/i', // <div> '/(<table[^>]*>|<\/table>)/i', // <table> and </table> '/(<tr[^>]*>|<\/tr>)/i', // <tr> and </tr> '/<td[^>]*>(.*?)<\/td>/i', // <td> and </td> @@ -167,7 +167,6 @@ '/&(apos|rsquo|lsquo|#8216|#8217);/i', // Single quotes '/>/i', // Greater-than '/</i', // Less-than - '/&(amp|#38);/i', // Ampersand '/&(copy|#169);/i', // Copyright '/&(trade|#8482|#153);/i', // Trademark '/&(reg|#174);/i', // Registered @@ -176,7 +175,7 @@ '/&(bull|#149|#8226);/i', // Bullet '/&(pound|#163);/i', // Pound sign '/&(euro|#8364);/i', // Euro sign - '/&[^&;]+;/i', // Unknown/unhandled entities + '/&(amp|#38);/i', // Ampersand: see _converter() '/[ ]{2,}/' // Runs of spaces, post-handling ); @@ -193,8 +192,7 @@ ' ', // Runs of spaces, pre-handling '', // <script>s -- which strip_tags supposedly has problems with '', // <style>s -- which strip_tags supposedly has problems with - //'', // Comments -- which strip_tags might have problem a with - "\n\n", // <P> + "\n\n", // <P> "\n", // <br> '_\\1_', // <i> '_\\1_', // <em> @@ -202,8 +200,9 @@ "\n\n", // <ol> and </ol> "\t* \\1\n", // <li> and </li> "\n\t* ", // <li> - "\n-------------------------\n", // <hr> - "\n\n", // <table> and </table> + "\n-------------------------\n", // <hr> + "<div>\n", // <div> + "\n\n", // <table> and </table> "\n", // <tr> and </tr> "\t\t\\1\n", // <td> and </td> ' ', // Non-breaking space @@ -211,7 +210,6 @@ "'", // Single quotes '>', '<', - '&', '(c)', '(tm)', '(R)', @@ -220,7 +218,7 @@ '*', '£', 'EUR', // Euro sign. � ? - '', // Unknown/unhandled entities + '|+|amp|+|', // Ampersand: see _converter() ' ' // Runs of spaces, post-handling ); @@ -249,11 +247,11 @@ * @see $pre_replace */ var $pre_search = array( - "/\n/", - "/\t/", - '/ /', - '/<pre[^>]*>/', - '/<\/pre>/' + "/\n/", + "/\t/", + '/ /', + '/<pre[^>]*>/', + '/<\/pre>/' ); /** @@ -264,11 +262,11 @@ * @see $pre_search */ var $pre_replace = array( - '<br>', - ' ', - ' ', - '', - '' + '<br>', + ' ', + ' ', + '', + '' ); /** @@ -344,10 +342,10 @@ if ( !empty($source) ) { $this->set_html($source, $from_file); } - + $this->set_base_url(); - $this->_do_links = $do_links; - $this->width = $width; + $this->_do_links = $do_links; + $this->width = $width; } /** @@ -361,10 +359,10 @@ function set_html( $source, $from_file = false ) { if ( $from_file && file_exists($source) ) { - $this->html = file_get_contents($source); + $this->html = file_get_contents($source); } else - $this->html = $source; + $this->html = $source; $this->_converted = false; } @@ -447,12 +445,7 @@ } /** - * Workhorse function that does actual conversion. - * - * First performs custom tag replacement specified by $search and - * $replace arrays. Then strips any remaining HTML tags, reduces whitespace - * and newlines to a readable format, and word wraps the text to - * $width characters. + * Workhorse function that does actual conversion (calls _converter() method). * * @access private * @return void @@ -465,15 +458,55 @@ $text = trim(stripslashes($this->html)); - // Convert <PRE> + // Convert HTML to TXT + $this->_converter($text); + + // Add link list + if ( !empty($this->_link_list) ) { + $text .= "\n\nLinks:\n------\n" . $this->_link_list; + } + + $this->text = $text; + + $this->_converted = true; + } + + /** + * Workhorse function that does actual conversion. + * + * First performs custom tag replacement specified by $search and + * $replace arrays. Then strips any remaining HTML tags, reduces whitespace + * and newlines to a readable format, and word wraps the text to + * $width characters. + * + * @param string Reference to HTML content string + * + * @access private + * @return void + */ + function _converter(&$text) + { + // Convert <BLOCKQUOTE> (before PRE!) + $this->_convert_blockquotes($text); + + // Convert <PRE> $this->_convert_pre($text); // Run our defined search-and-replace $text = preg_replace($this->search, $this->replace, $text); + + // Replace known html entities + $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8'); + + // Run our defined search-and-replace with callback $text = preg_replace_callback($this->callback_search, array('html2text', '_preg_callback'), $text); - // Replace known html entities - $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8'); + // Remove unknown/unhandled entities (this cannot be done in search-and-replace block) + $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text); + + // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities + // This properly handles situation of "&quot;" in input string + $text = str_replace('|+|amp|+|', '&', $text); // Strip any other HTML tags $text = strip_tags($text, $this->allowed_tags); @@ -482,10 +515,8 @@ $text = preg_replace("/\n\s+\n/", "\n\n", $text); $text = preg_replace("/[\n]{3,}/", "\n\n", $text); - // Add link list - if ( !empty($this->_link_list) ) { - $text .= "\n\nLinks:\n------\n" . $this->_link_list; - } + // remove leading empty lines (can be produced by eg. P tag on the beginning) + $text = preg_replace('/^\n+/', '', $text); // Wrap the text to a readable format // for PHP versions >= 4.0.2. Default width is 75 @@ -493,10 +524,6 @@ if ( $this->width > 0 ) { $text = wordwrap($text, $this->width); } - - $this->text = $text; - - $this->_converted = true; } /** @@ -514,20 +541,22 @@ */ function _build_link_list( $link, $display ) { - if ( !$this->_do_links ) return $display; - - if ( substr($link, 0, 7) == 'http://' || substr($link, 0, 8) == 'https://' || - substr($link, 0, 7) == 'mailto:' ) { + if ( !$this->_do_links ) + return $display; + + if ( substr($link, 0, 7) == 'http://' || substr($link, 0, 8) == 'https://' || + substr($link, 0, 7) == 'mailto:' + ) { $this->_link_count++; - $this->_link_list .= "[" . $this->_link_count . "] $link\n"; + $this->_link_list .= '[' . $this->_link_count . "] $link\n"; $additional = ' [' . $this->_link_count . ']'; - } elseif ( substr($link, 0, 11) == 'javascript:' ) { - // Don't count the link; ignore it - $additional = ''; + } elseif ( substr($link, 0, 11) == 'javascript:' ) { + // Don't count the link; ignore it + $additional = ''; // what about href="#anchor" ? } else { $this->_link_count++; - $this->_link_list .= "[" . $this->_link_count . "] " . $this->url; + $this->_link_list .= '[' . $this->_link_count . '] ' . $this->url; if ( substr($link, 0, 1) != '/' ) { $this->_link_list .= '/'; } @@ -537,7 +566,7 @@ return $display . $additional; } - + /** * Helper function for PRE body conversion. * @@ -546,11 +575,73 @@ */ function _convert_pre(&$text) { - while(preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) - { - $result = preg_replace($this->pre_search, $this->pre_replace, $matches[1]); - $text = preg_replace('/<pre[^>]*>.*<\/pre>/ismU', '<div><br>' . $result . '<br></div>', $text, 1); - } + // get the content of PRE element + while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) { + // convert the content + $this->pre_content = sprintf('<div><br>%s<br></div>', + preg_replace($this->pre_search, $this->pre_replace, $matches[1])); + // replace the content (use callback because content can contain $0 variable) + $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU', + array('html2text', '_preg_pre_callback'), $text, 1); + // free memory + $this->pre_content = ''; + } + } + + /** + * Helper function for BLOCKQUOTE body conversion. + * + * @param string HTML content + * @access private + */ + function _convert_blockquotes(&$text) + { + if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) { + $level = 0; + $diff = 0; + foreach ($matches[0] as $m) { + if ($m[0][0] == '<' && $m[0][1] == '/') { + $level--; + if ($level < 0) { + $level = 0; // malformed HTML: go to next blockquote + } + else if ($level > 0) { + // skip inner blockquote + } + else { + $end = $m[1]; + $len = $end - $taglen - $start; + // Get blockquote content + $body = substr($text, $start + $taglen - $diff, $len); + + // Set text width + $p_width = $this->width; + if ($this->width > 0) $this->width -= 2; + // Convert blockquote content + $body = trim($body); + $this->_converter($body); + // Add citation markers and create PRE block + $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body)); + $body = '<pre>' . htmlspecialchars($body) . '</pre>'; + // Re-set text width + $this->width = $p_width; + // Replace content + $text = substr($text, 0, $start - $diff) + . $body . substr($text, $end + strlen($m[0]) - $diff); + + $diff = $len + $taglen + strlen($m[0]) - strlen($body); + unset($body); + } + } + else { + if ($level == 0) { + $start = $m[1]; + $taglen = strlen($m[0]); + } + $level ++; + } + } + } } /** @@ -558,38 +649,46 @@ * * @param array PREG matches * @return string - * @access private */ - function _preg_callback($matches) + private function _preg_callback($matches) { - switch($matches[1]) - { - case 'b': - case 'strong': - return $this->_strtoupper($matches[2]); - case 'hr': - return $this->_strtoupper("\t\t". $matches[2] ."\n"); - case 'h': - return $this->_strtoupper("\n\n". $matches[2] ."\n\n"); - case 'a': - return $this->_build_link_list($matches[3], $matches[4]); - } + switch($matches[1]) { + case 'b': + case 'strong': + return $this->_strtoupper($matches[2]); + case 'th': + return $this->_strtoupper("\t\t". $matches[2] ."\n"); + case 'h': + return $this->_strtoupper("\n\n". $matches[2] ."\n\n"); + case 'a': + // Remove spaces in URL (#1487805) + $url = str_replace(' ', '', $matches[3]); + return $this->_build_link_list($url, $matches[4]); + } } - + + /** + * Callback function for preg_replace_callback use in PRE content handler. + * + * @param array PREG matches + * @return string + */ + private function _preg_pre_callback($matches) + { + return $this->pre_content; + } + /** * Strtoupper multibyte wrapper function * * @param string * @return string - * @access private */ - function _strtoupper($str) + private function _strtoupper($str) { - if (function_exists('mb_strtoupper')) - return mb_strtoupper($str); - else - return strtoupper($str); + if (function_exists('mb_strtoupper')) + return mb_strtoupper($str); + else + return strtoupper($str); } } - -?> -- Gitblit v1.9.1