Unicode encode :)

Jul 8th, 2007 | Filed under PHP
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
 
/**
 * Converts "%uXXXX" escape sequences into decimal HTML numeric entities.
 *
 * Each "%u" followed by exactly four hexadecimal digits (matched
 * case-insensitively) is replaced by "&#N;", where N is the decimal value
 * of those digits; for example "%u4F19" becomes "&#20249;". All other text
 * is passed through unchanged.
 *
 * @param mixed $str Subject handed to preg_replace_callback().
 * @access public
 * @return string|null The converted text, exactly as returned by
 *                     preg_replace_callback() (null if PCRE reports an error).
 */
function convert_entities($str) {
    // create_function() was deprecated in PHP 7.2 and removed in PHP 8.0,
    // so the callback is expressed as an anonymous function instead.
    return preg_replace_callback(
        '|%u([a-f0-9]{4})|i',
        function ($matches) {
            return '&#' . hexdec($matches[1]) . ';';
        },
        $str
    );
}
 
 
/**
 * Replaces HTML numeric character references with their UTF-8 bytes.
 *
 * Decimal ("&#20249;") and hexadecimal ("&#x4F19;") references are decoded.
 * Everything else is copied through untouched, including:
 *   - plain text that merely contains a ';',
 *   - a "&#" that is not followed by digits and a terminating ';',
 *   - references outside the Unicode range (above U+10FFFF) or inside the
 *     surrogate range (U+D800..U+DFFF), which have no valid UTF-8 encoding.
 *
 * @param mixed $source Text to decode; it is cast to string before processing.
 * @access public
 * @return string The text with numeric character references decoded.
 */
function utf8encode($source) {
    $source = (string) $source;

    $decoded = preg_replace_callback(
        // Group 1 captures hexadecimal digits, group 2 decimal digits.
        '/&#(?:x([0-9a-f]+)|([0-9]+));/i',
        function ($matches) {
            // Group 2 is only present when the decimal alternative matched.
            $codePoint = isset($matches[2]) ? (int) $matches[2] : hexdec($matches[1]);

            // Leave references alone when they cannot be valid UTF-8.
            // hexdec() may return a float for very long input; the range
            // comparison handles that before the value is narrowed to int.
            if ($codePoint > 0x10FFFF || ($codePoint >= 0xD800 && $codePoint <= 0xDFFF)) {
                return $matches[0];
            }
            $codePoint = (int) $codePoint;

            // 1 byte: 0xxxxxxx
            if ($codePoint < 0x80) {
                return chr($codePoint);
            }
            // 2 bytes: 110xxxxx 10xxxxxx
            if ($codePoint < 0x800) {
                return chr(0xC0 | ($codePoint >> 6))
                    . chr(0x80 | ($codePoint & 0x3F));
            }
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            if ($codePoint < 0x10000) {
                return chr(0xE0 | ($codePoint >> 12))
                    . chr(0x80 | (($codePoint >> 6) & 0x3F))
                    . chr(0x80 | ($codePoint & 0x3F));
            }
            // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            return chr(0xF0 | ($codePoint >> 18))
                . chr(0x80 | (($codePoint >> 12) & 0x3F))
                . chr(0x80 | (($codePoint >> 6) & 0x3F))
                . chr(0x80 | ($codePoint & 0x3F));
        },
        $source
    );

    // preg_replace_callback() yields null on a PCRE failure; fall back to the
    // undecoded input so the function always returns a string.
    return $decoded === null ? $source : $decoded;
}
 
// Demo: rewrite the %u-escaped sample as numeric entities, then emit it as UTF-8.
$str = convert_entities('%u4F19%u8BA1');
echo utf8encode($str);
// output: 伙计
Tags: ,
Comments are closed.