From 2f8b1036da42ec3d15a51c6b17a473f9f4df71d3 Mon Sep 17 00:00:00 2001
From: Thomas Bruederli <bruederli@kolabsys.com>
Date: Sat, 07 Feb 2015 12:33:24 -0500
Subject: [PATCH] Bump version and copyright year

---
 program/lib/Roundcube/rcube_utils.php | 22 +++++++++++++++-------
 1 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/program/lib/Roundcube/rcube_utils.php b/program/lib/Roundcube/rcube_utils.php
index 2e4aa32..f4c0e90 100644
--- a/program/lib/Roundcube/rcube_utils.php
+++ b/program/lib/Roundcube/rcube_utils.php
@@ -912,14 +912,21 @@
      * Split the given string into word tokens
      *
      * @param string Input to tokenize
+     * @param integer Minimum length of a single token
      * @return array List of tokens
      */
-    public static function tokenize_string($str)
+    public static function tokenize_string($str, $minlen = 2)
     {
-        return explode(" ", preg_replace(
-            array('/[\s;\/+-]+/i', '/(\d)[-.\s]+(\d)/', '/\s\w{1,3}\s/u'),
-            array(' ', '\\1\\2', ' '),
-            $str));
+        $expr = array('/[\s;\/+-]+/ui', '/(\d)[-.\s]+(\d)/u');
+        $repl = array(' ', '\\1\\2');
+
+        if ($minlen > 1) {
+            $minlen--;
+            $expr[] = "/(^|\s+)\w{1,$minlen}(\s+|$)/u";
+            $repl[] = ' ';
+        }
+
+        return array_filter(explode(" ", preg_replace($expr, $repl, $str)));
     }
 
     /**
@@ -928,10 +935,11 @@
      *
      * @param string Input string (UTF-8)
      * @param boolean True to return list of words as array
+     * @param integer Minimum length of tokens
      *
      * @return mixed Normalized string or a list of normalized tokens
      */
-    public static function normalize_string($str, $as_array = false)
+    public static function normalize_string($str, $as_array = false, $minlen = 2)
     {
         // replace 4-byte unicode characters with '?' character,
         // these are not supported in default utf-8 charset on mysql,
@@ -943,7 +951,7 @@
             . ')/', '?', $str);
 
         // split by words
-        $arr = self::tokenize_string($str);
+        $arr = self::tokenize_string($str, $minlen);
 
         // detect character set
         if (utf8_encode(utf8_decode($str)) == $str) {
-- 
Gitblit v1.9.1