1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
/**
 * Convert "%uXXXX" escape sequences into decimal HTML numeric character
 * references, e.g. "%u4F19" becomes "&#20249;".
 *
 * Only "%u" followed by exactly four hexadecimal digits is matched (the
 * match is case-insensitive); all other text is left untouched.
 *
 * @param mixed $str Text that may contain "%uXXXX" sequences.
 * @access public
 * @return string The text with every "%uXXXX" replaced by "&#<decimal>;".
 */
function convert_entities($str) {
    // An anonymous function is used instead of create_function(), which was
    // deprecated in PHP 7.2 and removed in PHP 8.0.
    return preg_replace_callback(
        '|%u([a-f0-9]{4})|i',
        function ($matches) {
            // $matches[1] holds the four hex digits; emit them as decimal.
            return '&#' . hexdec($matches[1]) . ';';
        },
        $str
    );
}
/**
 * Convert numeric HTML character references into raw UTF-8 characters.
 *
 * Both decimal ("&#20249;") and hexadecimal ("&#x4F19;") references are
 * decoded. Anything that is not a complete, well-formed numeric reference
 * (for example "&#65" without the closing ";", "&#abc;", or ordinary text
 * that merely contains a ";") is returned unchanged. References that do not
 * name an encodable code point (UTF-16 surrogates, values above U+10FFFF)
 * are also left unchanged so that the result never contains invalid UTF-8.
 *
 * @param mixed $source Text that may contain numeric character references.
 * @access public
 * @return string The text with each valid reference replaced by its UTF-8 bytes.
 */
function utf8encode($source) {
    return preg_replace_callback(
        '/&#(?:[xX]([0-9a-fA-F]+)|([0-9]+));/',
        function ($matches) {
            // Group 2 is set only for the decimal form; otherwise group 1
            // holds the hexadecimal digits.
            if (isset($matches[2]) && $matches[2] !== '') {
                $unicode = (int) $matches[2];
            } else {
                $unicode = hexdec($matches[1]);
            }

            // Reject values that are not Unicode scalar values. Oversized
            // digit strings saturate (int) or become a float (hexdec), so
            // they land here too.
            if ($unicode > 0x10FFFF || ($unicode >= 0xD800 && $unicode <= 0xDFFF)) {
                return $matches[0];
            }
            $unicode = (int) $unicode;

            // 1 byte: 0xxxxxxx
            if ($unicode < 0x80) {
                return chr($unicode);
            }
            // 2 bytes: 110xxxxx 10xxxxxx
            if ($unicode < 0x800) {
                return chr(0xC0 | ($unicode >> 6))
                    . chr(0x80 | ($unicode & 0x3F));
            }
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            if ($unicode < 0x10000) {
                return chr(0xE0 | ($unicode >> 12))
                    . chr(0x80 | (($unicode >> 6) & 0x3F))
                    . chr(0x80 | ($unicode & 0x3F));
            }
            // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            return chr(0xF0 | ($unicode >> 18))
                . chr(0x80 | (($unicode >> 12) & 0x3F))
                . chr(0x80 | (($unicode >> 6) & 0x3F))
                . chr(0x80 | ($unicode & 0x3F));
        },
        (string) $source
    );
}
// Demo: turn "%uXXXX" escapes into numeric references, then into UTF-8.
$str = convert_entities('%u4F19%u8BA1');
echo utf8encode($str);
// Prints: 伙计