From d19c0f9f309cbe63411a8ddcbbda3daf7461a30d Mon Sep 17 00:00:00 2001
From: Aleksander Machniak <alec@alec.pl>
Date: Thu, 12 Dec 2013 02:58:54 -0500
Subject: [PATCH] In normalize_string() replace 4-byte unicode characters with '?' character. These are not supported in default utf-8 charset on mysql, the chance we'd need them in searching is very low.

---
 program/lib/Roundcube/rcube_utils.php |  112 +++++++++++++++++++++++++++++++++++++-------------------
 1 files changed, 74 insertions(+), 38 deletions(-)

diff --git a/program/lib/Roundcube/rcube_utils.php b/program/lib/Roundcube/rcube_utils.php
index c1ad382..db41a6e 100644
--- a/program/lib/Roundcube/rcube_utils.php
+++ b/program/lib/Roundcube/rcube_utils.php
@@ -445,34 +445,41 @@
         $source   = self::xss_entity_decode($source);
         $stripped = preg_replace('/[^a-z\(:;]/i', '', $source);
         $evilexpr = 'expression|behavior|javascript:|import[^a]' . (!$allow_remote ? '|url\(' : '');
+
         if (preg_match("/$evilexpr/i", $stripped)) {
             return '/* evil! */';
         }
 
+        $strict_url_regexp = '!url\s*\([ "\'](https?:)//[a-z0-9/._+-]+["\' ]\)!Uims';
+
         // cut out all contents between { and }
         while (($pos = strpos($source, '{', $last_pos)) && ($pos2 = strpos($source, '}', $pos))) {
-            $styles = substr($source, $pos+1, $pos2-($pos+1));
+            $nested = strpos($source, '{', $pos+1);
+            if ($nested && $nested < $pos2)  // when dealing with nested blocks (e.g. @media), take the inner one
+                $pos = $nested;
+            $length = $pos2 - $pos - 1;
+            $styles = substr($source, $pos+1, $length);
 
             // check every line of a style block...
             if ($allow_remote) {
                 $a_styles = preg_split('/;[\r\n]*/', $styles, -1, PREG_SPLIT_NO_EMPTY);
+
                 foreach ($a_styles as $line) {
                     $stripped = preg_replace('/[^a-z\(:;]/i', '', $line);
                     // ... and only allow strict url() values
-                    $regexp = '!url\s*\([ "\'](https?:)//[a-z0-9/._+-]+["\' ]\)!Uims';
-                    if (stripos($stripped, 'url(') && !preg_match($regexp, $line)) {
+                    if (stripos($stripped, 'url(') && !preg_match($strict_url_regexp, $line)) {
                         $a_styles = array('/* evil! */');
                         break;
                     }
                 }
+
                 $styles = join(";\n", $a_styles);
             }
 
-            $key = $replacements->add($styles);
-            $source = substr($source, 0, $pos+1)
-                . $replacements->get_replacement($key)
-                . substr($source, $pos2, strlen($source)-$pos2);
-            $last_pos = $pos+2;
+            $key      = $replacements->add($styles);
+            $repl     = $replacements->get_replacement($key);
+            $source   = substr_replace($source, $repl, $pos+1, $length);
+            $last_pos = $pos2 - ($length - strlen($repl));
         }
 
         // remove html comments and add #container to each tag selector.
@@ -740,39 +747,12 @@
      */
     public static function strtotime($date)
     {
-        $date = trim($date);
-
-        // check for MS Outlook vCard date format YYYYMMDD
-        if (preg_match('/^([12][90]\d\d)([01]\d)([0123]\d)$/', $date, $m)) {
-            return mktime(0,0,0, intval($m[2]), intval($m[3]), intval($m[1]));
-        }
-
-        // common little-endian formats, e.g. dd/mm/yyyy (not all are supported by strtotime)
-        if (preg_match('/^(\d{1,2})[.\/-](\d{1,2})[.\/-](\d{4})$/', $date, $m)
-            && $m[1] > 0 && $m[1] <= 31 && $m[2] > 0 && $m[2] <= 12 && $m[3] >= 1970
-        ) {
-            return mktime(0,0,0, intval($m[2]), intval($m[1]), intval($m[3]));
-        }
+        $date = self::clean_datestr($date);
 
         // unix timestamp
         if (is_numeric($date)) {
             return (int) $date;
         }
-
-        // Clean malformed data
-        $date = preg_replace(
-            array(
-                '/GMT\s*([+-][0-9]+)/',                     // support non-standard "GMTXXXX" literal
-                '/[^a-z0-9\x20\x09:+-]/i',                  // remove any invalid characters
-                '/\s*(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s*/i',   // remove weekday names
-            ),
-            array(
-                '\\1',
-                '',
-                '',
-            ), $date);
-
-        $date = trim($date);
 
         // if date parsing fails, we have a date in non-rfc format.
         // remove token from the end and try again
@@ -801,8 +781,8 @@
             return $date;
         }
 
-        $dt = false;
-        $date = trim($date);
+        $dt   = false;
+        $date = self::clean_datestr($date);
 
         // try to parse string with DateTime first
         if (!empty($date)) {
@@ -825,6 +805,52 @@
         }
 
         return $dt;
+    }
+
+    /**
+     * Clean up date string for strtotime() input
+     *
+     * @param string $date Date string
+     *
+     * @return string Date string
+     */
+    public static function clean_datestr($date)
+    {
+        $date = trim($date);
+
+        // check for MS Outlook vCard date format YYYYMMDD
+        if (preg_match('/^([12][90]\d\d)([01]\d)([0123]\d)$/', $date, $m)) {
+            return sprintf('%04d-%02d-%02d 00:00:00', intval($m[1]), intval($m[2]), intval($m[3]));
+        }
+
+        // Clean malformed data
+        $date = preg_replace(
+            array(
+                '/GMT\s*([+-][0-9]+)/',                     // support non-standard "GMTXXXX" literal
+                '/[^a-z0-9\x20\x09:+-\/]/i',                  // remove any invalid characters
+                '/\s*(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s*/i',   // remove weekday names
+            ),
+            array(
+                '\\1',
+                '',
+                '',
+            ), $date);
+
+        $date = trim($date);
+
+        // try to fix dd/mm vs. mm/dd discrepancy, we can't do more here
+        if (preg_match('/^(\d{1,2})[.\/-](\d{1,2})[.\/-](\d{4})$/', $date, $m)) {
+            $mdy   = $m[2] > 12 && $m[1] <= 12;
+            $day   = $mdy ? $m[2] : $m[1];
+            $month = $mdy ? $m[1] : $m[2];
+            $date  = sprintf('%04d-%02d-%02d 00:00:00', intval($m[3]), $month, $day);
+        }
+        // I've found that YYYY.MM.DD is recognized wrong, so here's a fix
+        else if (preg_match('/^(\d{4})\.(\d{1,2})\.(\d{1,2})$/', $date)) {
+            $date = str_replace('.', '-', $date) . ' 00:00:00';
+        }
+
+        return $date;
     }
 
     /*
@@ -886,10 +912,20 @@
      *
      * @param string  Input string (UTF-8)
      * @param boolean True to return list of words as array
+     *
      * @return mixed  Normalized string or a list of normalized tokens
      */
     public static function normalize_string($str, $as_array = false)
     {
+        // replace 4-byte unicode characters with '?' character,
+        // these are not supported in default utf-8 charset on mysql,
+        // the chance we'd need them in searching is very low
+        $str = preg_replace('/('
+            . '\xF0[\x90-\xBF][\x80-\xBF]{2}'
+            . '|[\xF1-\xF3][\x80-\xBF]{3}'
+            . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'
+            . ')/', '?', $str);
+
         // split by words
         $arr = self::tokenize_string($str);
 

--
Gitblit v1.9.1