| | |
| | | '|rgb\(\s*[0-9]+\s*,\s*[0-9]+\s*,\s*[0-9]+\s*\)'. |
| | | '|-?[0-9.]+\s*(em|ex|px|cm|mm|in|pt|pc|deg|rad|grad|ms|s|hz|khz|%)?'. |
| | | '|#[0-9a-f]{3,6}'. |
| | | '|[a-z0-9", -]+'. |
| | | '|[a-z0-9"\', -]+'. |
| | | ')\s*/i', $str, $match) |
| | | ) { |
| | | if ($match[2]) { |
| | |
| | | |
| | | /** |
| | | * The main loop that recurses on a node tree. |
| | | * It outputs only allowed tags with allowed attributes |
| | | * and allowed inline styles |
| | | * It outputs only allowed tags with allowed attributes and allowed inline styles |
| | | * |
| | | * @param DOMNode $node HTML element |
| | | * @param int $level Recurrence level (safe initial value found empirically) |
| | | */ |
| | | private function dumpHtml($node, $level = 0) |
| | | private function dumpHtml($node, $level = 20) |
| | | { |
| | | if (!$node->hasChildNodes()) { |
| | | return ''; |
| | |
| | | $html = preg_replace($html_search, $html_replace, trim($html)); |
| | | |
| | | //-> Replace all of those weird MS Word quotes and other high characters |
| | | $badwordchars=array( |
| | | $badwordchars = array( |
| | | "\xe2\x80\x98", // left single quote |
| | | "\xe2\x80\x99", // right single quote |
| | | "\xe2\x80\x9c", // left double quote |
| | |
| | | "\xe2\x80\x94", // em dash |
| | | "\xe2\x80\xa6" // elipses |
| | | ); |
| | | $fixedwordchars=array( |
| | | $fixedwordchars = array( |
| | | "'", |
| | | "'", |
| | | '"', |
| | |
| | | '—', |
| | | '...' |
| | | ); |
| | | $html = str_replace($badwordchars,$fixedwordchars, $html); |
| | | $html = str_replace($badwordchars, $fixedwordchars, $html); |
| | | |
| | | // PCRE error handling (#1486856); should we do something like this for every preg_* call? |
| | | if ($html === null && ($preg_error = preg_last_error()) != PREG_NO_ERROR) { |
| | |
| | | } |
| | | |
| | | // fix (unknown/malformed) HTML tags before "wash" |
| | | $html = preg_replace_callback('/(<(?!\!)[\/]*)([^\s>]+)/', array($this, 'html_tag_callback'), $html); |
| | | $html = preg_replace_callback('/(<(?!\!)[\/]*)([^\s>]+)([^>]*)/', array($this, 'html_tag_callback'), $html); |
| | | |
| | | // Remove invalid HTML comments (#1487759) |
| | | // Don't remove valid conditional comments |
| | | // Don't remove MSOutlook (<!-->) conditional comments (#1489004) |
| | | $html = preg_replace('/<!--[^->\[\n]+>/', '', $html); |
| | | |
| | | // fix broken nested lists |
| | | self::fix_broken_lists($html); |
| | | |
| | | // turn relative into absolute urls |
| | | $html = self::resolve_base($html); |
| | |
| | | '/[^a-z0-9_\[\]\!-]/i', // forbidden characters |
| | | ), '', $tagname); |
| | | |
| | | return $matches[1] . $tagname; |
| | | // fix invalid closing tags - remove any attributes (#1489446) |
| | | if ($matches[1] == '</') { |
| | | $matches[3] = ''; |
| | | } |
| | | |
| | | return $matches[1] . $tagname . $matches[3]; |
| | | } |
| | | |
| | | /** |
| | |
| | | |
| | | return $body; |
| | | } |
| | | } |
| | | |
/**
 * Fix broken nested lists, they are not handled properly by DOMDocument (#1488768)
 *
 * Scans every <ol>/<ul> element and looks for a nested <ol>/<ul> that
 * starts outside of any <li>, after at least one <li> has been closed.
 * Such a nested list is cut out and re-inserted right before the closing
 * tag of the preceding <li>, so that it becomes part of that list item.
 *
 * A tag name is only recognized when it is directly followed by a space
 * or by '>'. Markup that is cut short (input ending right after a tag
 * name, or a closing list tag without its '>') is left untouched.
 *
 * @param string &$html HTML content, modified in place
 */
public static function fix_broken_lists(&$html)
{
    // do two rounds, one for <ol>, one for <ul>
    foreach (array('ol', 'ul') as $tag) {
        $pos = 0;
        while (($pos = stripos($html, '<' . $tag, $pos)) !== false) {
            $pos++;

            // make sure this is an ol/ul tag; substr() is used instead of
            // $html[$i] because it is safe when the offset is past the end
            $next = substr($html, $pos + 2, 1);
            if ($next !== ' ' && $next !== '>') {
                continue;
            }

            $p      = $pos;
            $in_li  = false;
            $li_pos = 0;

            while (($p = strpos($html, '<', $p)) !== false) {
                $tt = strtolower(substr($html, $p, 4));

                // character following a 4-char closing tag prefix like "</li";
                // not a space or '>' (possibly missing) means it is some other tag
                $next      = substr($html, $p + 4, 1);
                $name_ends = $next === ' ' || $next === '>';

                // li open tag
                if ($tt === '<li>' || $tt === '<li ') {
                    $in_li = true;
                    $p += 4;
                }
                // li close tag
                else if ($tt === '</li' && $name_ends) {
                    // remember where the last item was closed, a misplaced
                    // nested list will be moved right before this position
                    $li_pos = $p;
                    $in_li  = false;
                    $p += 4;
                }
                // ul/ol closing tag
                else if ($tt === '</' . $tag && $name_ends) {
                    break;
                }
                // nested ol/ul element out of li
                else if (!$in_li && $li_pos && ($tt === '<ol>' || $tt === '<ol ' || $tt === '<ul>' || $tt === '<ul ')) {
                    // find closing tag of this ul/ol element
                    $element = substr($tt, 1, 2);
                    $cpos    = $p;
                    do {
                        $tpos = stripos($html, '<' . $element, $cpos + 1);
                        $cpos = stripos($html, '</' . $element, $cpos + 1);
                    }
                    while ($tpos !== false && $cpos !== false && $cpos > $tpos);

                    // not found, this is invalid HTML, skip it
                    if ($cpos === false) {
                        break;
                    }

                    // closing tag is not terminated, this is invalid HTML too;
                    // without this check the length below would be negative
                    // and the content would get corrupted
                    $end = strpos($html, '>', $cpos);
                    if ($end === false) {
                        break;
                    }

                    // get element content
                    $len     = $end - $p + 1;
                    $element = substr($html, $p, $len);

                    // move element to the end of the last li
                    $html = substr_replace($html, '', $p, $len);
                    $html = substr_replace($html, $element, $li_pos, 0);

                    // total length did not change, continue after the moved content
                    $p = $end;
                }
                else {
                    $p++;
                }
            }
        }
    }
}
| | | } |