From d19c0f9f309cbe63411a8ddcbbda3daf7461a30d Mon Sep 17 00:00:00 2001 From: Aleksander Machniak <alec@alec.pl> Date: Thu, 12 Dec 2013 02:58:54 -0500 Subject: [PATCH] In normalize_string() replace 4-byte unicode characters with '?' character. These are not supported in default utf-8 charset on mysql, the chance we'd need them in searching is very low. --- program/lib/Roundcube/rcube_utils.php | 112 +++++++++++++++++++++++++++++++++++++------------------- 1 files changed, 74 insertions(+), 38 deletions(-) diff --git a/program/lib/Roundcube/rcube_utils.php b/program/lib/Roundcube/rcube_utils.php index c1ad382..db41a6e 100644 --- a/program/lib/Roundcube/rcube_utils.php +++ b/program/lib/Roundcube/rcube_utils.php @@ -445,34 +445,41 @@ $source = self::xss_entity_decode($source); $stripped = preg_replace('/[^a-z\(:;]/i', '', $source); $evilexpr = 'expression|behavior|javascript:|import[^a]' . (!$allow_remote ? '|url\(' : ''); + if (preg_match("/$evilexpr/i", $stripped)) { return '/* evil! */'; } + $strict_url_regexp = '!url\s*\([ "\'](https?:)//[a-z0-9/._+-]+["\' ]\)!Uims'; + // cut out all contents between { and } while (($pos = strpos($source, '{', $last_pos)) && ($pos2 = strpos($source, '}', $pos))) { - $styles = substr($source, $pos+1, $pos2-($pos+1)); + $nested = strpos($source, '{', $pos+1); + if ($nested && $nested < $pos2) // when dealing with nested blocks (e.g. @media), take the inner one + $pos = $nested; + $length = $pos2 - $pos - 1; + $styles = substr($source, $pos+1, $length); // check every line of a style block... if ($allow_remote) { $a_styles = preg_split('/;[\r\n]*/', $styles, -1, PREG_SPLIT_NO_EMPTY); + foreach ($a_styles as $line) { $stripped = preg_replace('/[^a-z\(:;]/i', '', $line); // ... and only allow strict url() values - $regexp = '!url\s*\([ "\'](https?:)//[a-z0-9/._+-]+["\' ]\)!Uims'; - if (stripos($stripped, 'url(') && !preg_match($regexp, $line)) { + if (stripos($stripped, 'url(') && !preg_match($strict_url_regexp, $line)) { $a_styles = array('/* evil! */'); break; } } + $styles = join(";\n", $a_styles); } - $key = $replacements->add($styles); - $source = substr($source, 0, $pos+1) - . $replacements->get_replacement($key) - . substr($source, $pos2, strlen($source)-$pos2); - $last_pos = $pos+2; + $key = $replacements->add($styles); + $repl = $replacements->get_replacement($key); + $source = substr_replace($source, $repl, $pos+1, $length); + $last_pos = $pos2 - ($length - strlen($repl)); } // remove html comments and add #container to each tag selector. @@ -740,39 +747,12 @@ */ public static function strtotime($date) { - $date = trim($date); - - // check for MS Outlook vCard date format YYYYMMDD - if (preg_match('/^([12][90]\d\d)([01]\d)([0123]\d)$/', $date, $m)) { - return mktime(0,0,0, intval($m[2]), intval($m[3]), intval($m[1])); - } - - // common little-endian formats, e.g. dd/mm/yyyy (not all are supported by strtotime) - if (preg_match('/^(\d{1,2})[.\/-](\d{1,2})[.\/-](\d{4})$/', $date, $m) - && $m[1] > 0 && $m[1] <= 31 && $m[2] > 0 && $m[2] <= 12 && $m[3] >= 1970 - ) { - return mktime(0,0,0, intval($m[2]), intval($m[1]), intval($m[3])); - } + $date = self::clean_datestr($date); // unix timestamp if (is_numeric($date)) { return (int) $date; } - - // Clean malformed data - $date = preg_replace( - array( - '/GMT\s*([+-][0-9]+)/', // support non-standard "GMTXXXX" literal - '/[^a-z0-9\x20\x09:+-]/i', // remove any invalid characters - '/\s*(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s*/i', // remove weekday names - ), - array( - '\\1', - '', - '', - ), $date); - - $date = trim($date); // if date parsing fails, we have a date in non-rfc format. // remove token from the end and try again @@ -801,8 +781,8 @@ return $date; } - $dt = false; - $date = trim($date); + $dt = false; + $date = self::clean_datestr($date); // try to parse string with DateTime first if (!empty($date)) { @@ -825,6 +805,52 @@ } return $dt; + } + + /** + * Clean up date string for strtotime() input + * + * @param string $date Date string + * + * @return string Date string + */ + public static function clean_datestr($date) + { + $date = trim($date); + + // check for MS Outlook vCard date format YYYYMMDD + if (preg_match('/^([12][90]\d\d)([01]\d)([0123]\d)$/', $date, $m)) { + return sprintf('%04d-%02d-%02d 00:00:00', intval($m[1]), intval($m[2]), intval($m[3])); + } + + // Clean malformed data + $date = preg_replace( + array( + '/GMT\s*([+-][0-9]+)/', // support non-standard "GMTXXXX" literal + '/[^a-z0-9\x20\x09:+-\/]/i', // remove any invalid characters + '/\s*(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s*/i', // remove weekday names + ), + array( + '\\1', + '', + '', + ), $date); + + $date = trim($date); + + // try to fix dd/mm vs. mm/dd discrepancy, we can't do more here + if (preg_match('/^(\d{1,2})[.\/-](\d{1,2})[.\/-](\d{4})$/', $date, $m)) { + $mdy = $m[2] > 12 && $m[1] <= 12; + $day = $mdy ? $m[2] : $m[1]; + $month = $mdy ? $m[1] : $m[2]; + $date = sprintf('%04d-%02d-%02d 00:00:00', intval($m[3]), $month, $day); + } + // I've found that YYYY.MM.DD is recognized wrong, so here's a fix + else if (preg_match('/^(\d{4})\.(\d{1,2})\.(\d{1,2})$/', $date)) { + $date = str_replace('.', '-', $date) . ' 00:00:00'; + } + + return $date; } /* @@ -886,10 +912,20 @@ * * @param string Input string (UTF-8) * @param boolean True to return list of words as array + * * @return mixed Normalized string or a list of normalized tokens */ public static function normalize_string($str, $as_array = false) { + // replace 4-byte unicode characters with '?' character, + // these are not supported in default utf-8 charset on mysql, + // the chance we'd need them in searching is very low + $str = preg_replace('/(' + . '\xF0[\x90-\xBF][\x80-\xBF]{2}' + . '|[\xF1-\xF3][\x80-\xBF]{3}' + . '|\xF4[\x80-\x8F][\x80-\xBF]{2}' + . ')/', '?', $str); + // split by words $arr = self::tokenize_string($str); -- Gitblit v1.9.1