Fix rcube_utils::normalize_string() to support unicode characters + add argument for minimum token length

pull/244/head
Thomas Bruederli 10 years ago
parent 09c58d1add
commit e8b82c2e7b

@ -912,14 +912,21 @@ class rcube_utils
* Split the given string into word tokens
*
* @param string Input to tokenize
* @param integer Minimum length of a single token
* @return array List of tokens
*/
public static function tokenize_string($str, $minlen = 2)
{
    // 1) collapse runs of whitespace and the separators ; / + - into one space
    // 2) join digit groups separated by dashes, dots or whitespace
    $expr = array('/[\s;\/+-]+/ui', '/(\d)[-.\s]+(\d)/u');
    $repl = array(' ', '\\1\\2');

    $minlen = (int) $minlen;

    if ($minlen > 1) {
        // 3) remove words shorter than $minlen. Lookarounds are used so the
        //    surrounding whitespace is not consumed by the match; otherwise
        //    the second of two adjacent short words ("a b cd") is skipped.
        $expr[] = '/(?<!\S)\w{1,' . ($minlen - 1) . '}(?!\S)/u';
        $repl[] = '';
    }

    $str = preg_replace($expr, $repl, $str);

    // preg_replace() returns null on error (e.g. malformed UTF-8 with /u)
    if ($str === null) {
        return array();
    }

    // Drop empty strings only (a plain array_filter() would also discard
    // the token "0") and re-index so that the result is a proper list.
    return array_values(array_filter(explode(' ', $str), 'strlen'));
}
/**
@ -928,10 +935,11 @@ class rcube_utils
*
* @param string Input string (UTF-8)
* @param boolean True to return list of words as array
* @param integer Minimum length of tokens
*
* @return mixed Normalized string or a list of normalized tokens
*/
public static function normalize_string($str, $as_array = false)
public static function normalize_string($str, $as_array = false, $minlen = 2)
{
// replace 4-byte unicode characters with '?' character,
// these are not supported in default utf-8 charset on mysql,
@ -943,7 +951,7 @@ class rcube_utils
. ')/', '?', $str);
// split by words
$arr = self::tokenize_string($str);
$arr = self::tokenize_string($str, $minlen);
// detect character set
if (utf8_encode(utf8_decode($str)) == $str) {

@ -343,6 +343,25 @@ class Framework_Utils extends PHPUnit_Framework_TestCase
}
}
/**
 * Test for rcube_utils::tokenize_string()
 */
function test_tokenize_string()
{
    // input string => tokens expected with the default minimum token length
    $test = array(
        ''            => array(),
        'abc d'       => array('abc'),
        'abc de'      => array('abc', 'de'),
        'äàé;êöü-xyz' => array('äàé', 'êöü', 'xyz'),
        '日期格式'    => array('日期格式'),
    );

    foreach ($test as $input => $output) {
        $result = rcube_utils::tokenize_string($input);
        // name the failing input so a broken case can be identified
        $this->assertSame($output, $result, "Error tokenizing '$input'");
    }
}
/**
* rcube_utils::normalize_string()
*/
@ -353,15 +372,18 @@ class Framework_Utils extends PHPUnit_Framework_TestCase
'abc def' => 'abc def',
'ÇçäâàåæéêëèïîìÅÉöôòüûùÿøØáíóúñÑÁÂÀãÃÊËÈÍÎÏÓÔõÕÚÛÙýÝ' => 'ccaaaaaeeeeiiiaeooouuuyooaiounnaaaaaeeeiiioooouuuyy',
'ąáâäćçčéęëěíîłľĺńňóôöŕřśšşťţůúűüźžżýĄŚŻŹĆ' => 'aaaaccceeeeiilllnnooorrsssttuuuuzzzyaszzc',
'ß' => 'ss',
'ae' => 'a',
'oe' => 'o',
'ue' => 'u',
'ß' => '',
'ßs' => 'sss',
'Xae' => 'xa',
'Xoe' => 'xo',
'Xue' => 'xu',
'项目' => '项目',
'日' => '', // FIXME: this should not be stripped although minlen = 2
);
foreach ($test as $input => $output) {
$result = rcube_utils::normalize_string($input);
$this->assertSame($output, $result);
$this->assertSame($output, $result, "Error normalizing '$input'");
}
}

Loading…
Cancel
Save