PHP 两个方法分析：UTF-8 与 Unicode 编码转换

今天研究了半天，终于搞明白了 UTF-8 和 Unicode 的互转。另外：JavaScript 中的 escape 把汉字转换成 Unicode 编码后，其中的数字是十六进制的。比如：escape("文") = %u6587；如果在 HTML 文件中使用，可以写成【&#x6587;】，或者将十六进制的 6587 转换为十进制 25991，即【&#25991;】。第一个方法：utf8_uri_encode，将 UTF-8 编码的文字转换为 %ae%3d 这样的格式。比如"中"是 3 字节编码，各字节依次是 228(e4)、184(b8)、173(ad)，最后输出：%e4%b8%ad

/**
 * Percent-encodes every non-ASCII byte of a UTF-8 string as lowercase %xx.
 *
 * ASCII bytes (0-127) are copied through unchanged, including characters
 * such as '%' or ' ' that a full URL encoder would escape. Every byte of a
 * multi-byte UTF-8 sequence (2, 3 or 4 bytes long) becomes "%" followed by
 * its two-digit lowercase hexadecimal value, e.g. the three bytes of the
 * character U+4E2D (228, 184, 173) become "%e4%b8%ad".
 *
 * @author wordpress
 * @param $utf8_string string encoded using utf-8 [STRING]
 * @return string with all non-ASCII bytes percent-encoded [STRING]
 * @access public
 */
function utf8_uri_encode( $utf8_string ) {
    // Cast once so that non-string scalars behave as they do with strlen().
    $utf8_string = (string) $utf8_string;
    $length = strlen( $utf8_string ); // byte length, evaluated once
    $encoded = '';

    for ( $i = 0; $i < $length; $i++ ) {
        $value = ord( $utf8_string[ $i ] );

        if ( $value < 128 ) {
            // ASCII: copy through unchanged.
            $encoded .= $utf8_string[ $i ];
        } else {
            // Lead and continuation bytes of a multi-byte sequence are all
            // >= 128, so dechex() always yields exactly two hex digits here.
            // Encoding byte by byte gives the same result as grouping the
            // bytes per character, and also covers 4-byte sequences and
            // truncated input without dropping or misaligning any byte.
            $encoded .= '%' . dechex( $value );
        }
    }

    return $encoded;
}

第二个方法：utf8ToUnicodeEntities，将 UTF-8 编码的文字转换为 &#nnnnn; 或 &#nnn;（n 为 0 到 9 的数字）形式的 Unicode 十进制数字实体

/**
 * Takes a string of UTF-8 encoded characters and converts it to a string of
 * decimal numeric entities.
 *
 * Each well-formed multi-byte character (2, 3 or 4 bytes) becomes "&#N;",
 * where N is the decimal code point left-padded with zeros to at least five
 * digits (e.g. U+00E9 -> "&#00233;", U+4E2D -> "&#20013;").
 * ASCII bytes stay as they are (a single char) instead of being presented
 * as an entity.
 *
 * Bytes that do not form a well-formed sequence (a stray continuation byte,
 * a lead byte in 248-255, a sequence cut off by the end of the string, or a
 * lead byte not followed by enough continuation bytes) are copied through
 * unchanged, one byte at a time.
 *
 * @author Ronen Botzer
 * @param $source string encoded using utf-8 [STRING]
 * @return string of unicode entities [STRING]
 * @access public
 */
function utf8ToUnicodeEntities ($source) {
    // Cast once so that non-string scalars behave as they do with strlen().
    $source = (string) $source;
    $len = strlen($source); // byte length
    $encodedString = '';
    $pos = 0;

    while ($pos < $len) {
        $lead = ord($source[$pos]);

        // The number of leading 1 bits in the first byte gives the sequence
        // length; $marker is the value of those marker bits, subtracted to
        // leave only the payload bits of the lead byte.
        if ($lead >= 240 && $lead <= 247) {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            $seqLen = 4;
            $marker = 240;
        }
        elseif ($lead >= 224 && $lead <= 239) {
            // 1110xxxx 10xxxxxx 10xxxxxx
            $seqLen = 3;
            $marker = 224;
        }
        elseif ($lead >= 192 && $lead <= 223) {
            // 110xxxxx 10xxxxxx
            $seqLen = 2;
            $marker = 192;
        }
        else {
            // 0xxxxxxx (ASCII), or a byte that cannot start a sequence
            $seqLen = 1;
            $marker = 0;
        }

        // Only attempt to decode when the whole sequence fits in the string.
        $wellFormed = ($seqLen > 1) && ($pos + $seqLen <= $len);
        $codePoint = $lead - $marker;

        for ($k = 1; $wellFormed && $k < $seqLen; $k++) {
            $byte = ord($source[$pos + $k]);
            if ($byte < 128 || $byte > 191) {
                // Not a 10xxxxxx continuation byte: abandon the sequence.
                $wellFormed = false;
            }
            else {
                // Each continuation byte contributes its low 6 bits.
                $codePoint = ($codePoint << 6) | ($byte - 128);
            }
        }

        if ($wellFormed) {
            $encodedString .= '&#' . str_pad((string) $codePoint, 5, '0', STR_PAD_LEFT) . ';';
            $pos += $seqLen;
        }
        else {
            // ASCII or malformed input: emit the single byte untouched.
            $encodedString .= $source[$pos];
            $pos++;
        }
    }

    return $encodedString;
}