[ Index ] |
PHP Cross Reference of phpwcms V1.5.0 _r431 (28.01.12) |
[Summary view] [Print] [Text view]
<?php
/**
 * UCTC - The Unicode Transcoder
 *
 * Converts between various flavours of Unicode representations like UCS-4 or UTF-8
 * Supported schemes:
 * - UCS-4 Big Endian string / Array (partially)
 * - UTF-16 Little Endian / Big Endian (not yet)
 * - UTF-8
 * - UTF-7
 * - UTF-7 IMAP (modified UTF-7)
 *
 * @package phlyMail Nahariya 4.0+ Default branch
 * @author Matthias Sommerfeld <mso@phlyLabs.de>
 * @copyright 2003-2009 phlyLabs Berlin, http://phlylabs.de
 * @version 0.0.6 2009-05-10
 */
class uctc {
    /** Names of the supported encodings; they double as parts of the converter method names */
    private static $mechs = array('ucs4', /*'ucs4le', 'ucs4be', */'ucs4array', /*'utf16', 'utf16le', 'utf16be', */'utf8', 'utf7', 'utf7imap');
    /** When true, the UTF-8 decoder skips its range / overlong checks */
    private static $allow_overlong = false;
    /** When true, invalid data is replaced by $safe_char instead of raising an Exception */
    private static $safe_mode;
    /** Codepoint (integer) used as replacement for invalid data */
    private static $safe_char;

    /**
     * The actual conversion routine
     *
     * Every conversion goes through the intermediate 'ucs4array' representation
     * (an array of integer codepoints).
     *
     * @param mixed $data The data to convert, usually a string, array when converting from UCS-4 array
     * @param string $from Original encoding of the data
     * @param string $to Target encoding of the data
     * @param bool $safe_mode SafeMode tries to correct invalid codepoints
     * @param int $safe_char Replacement codepoint; anything outside 1..0x10FFFF falls back to 0xFFFC
     * @return mixed String or array on success, depending on target encoding
     * @throws Exception On unknown encoding names or (outside safe mode) malformed input
     * @access public
     * @since 0.0.1
     */
    public static function convert($data, $from, $to, $safe_mode = false, $safe_char = 0xFFFC)
    {
        self::$safe_mode = ($safe_mode) ? true : false;
        $safe_char = (int) $safe_char;
        self::$safe_char = ($safe_char > 0 && $safe_char <= 0x10FFFF) ? $safe_char : 0xFFFC;
        // Reset on every call - the flag is static and must not leak from an
        // earlier safe mode conversion into a later strict one
        self::$allow_overlong = self::$safe_mode;
        if (!in_array($from, self::$mechs, true)) throw new Exception('Invalid input format specified');
        if (!in_array($to, self::$mechs, true)) throw new Exception('Invalid output format specified');
        // $from / $to are whitelisted above, so the method names are safe to build dynamically
        if ($from != 'ucs4array') $data = call_user_func(array(__CLASS__, $from.'_ucs4array'), $data);
        if ($to != 'ucs4array') $data = call_user_func(array(__CLASS__, 'ucs4array_'.$to), $data);
        return $data;
    }

    /**
     * This converts an UTF-8 encoded string to its UCS-4 representation
     *
     * The decoder is a small state machine:
     *  'next' - a start byte (or ASCII) is expected
     *  'add'  - continuation bytes of the current sequence are expected
     *  'no'   - the input ended in the middle of a sequence, remaining non-ASCII bytes are skipped
     *
     * @param string $input The UTF-8 string to convert
     * @return array Array of 32bit values representing each codepoint
     * @throws Exception On malformed input when safe mode is off
     * @access private
     */
    private static function utf8_ucs4array($input)
    {
        $output = array();
        $out_len = 0;
        $inp_len = strlen($input);
        $mode = 'next';
        $test = 'none';
        $start_byte = 0;
        $next_byte = 0;
        for ($k = 0; $k < $inp_len; ++$k) {
            $v = ord($input[$k]); // Extract byte from input string

            if ($v < 128) { // We found an ASCII char - put into string as is
                $output[$out_len] = $v;
                ++$out_len;
                if ('add' == $mode) {
                    // ASCII interrupted a multi byte sequence
                    if (!self::$safe_mode) {
                        throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    }
                    $output[$out_len-2] = self::$safe_char; // Replace the partially decoded codepoint
                    $mode = 'next';
                }
                continue;
            }
            if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
                $start_byte = $v;
                $mode = 'add';
                $test = 'range';
                if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
                    $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
                    $v = ($v - 192) << 6;
                } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                    $next_byte = 1;
                    $v = ($v - 224) << 12;
                } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 2;
                    $v = ($v - 240) << 18;
                } elseif (self::$safe_mode) {
                    $mode = 'next';
                    $output[$out_len] = self::$safe_char;
                    ++$out_len;
                    continue;
                } else {
                    throw new Exception('This might be UTF-8, but I don\'t understand it at byte '.$k);
                }
                if ($inp_len-$k-$next_byte < 2) {
                    // Input ends before the sequence is complete. The slot must be
                    // consumed, otherwise a trailing ASCII byte would overwrite it.
                    $output[$out_len] = self::$safe_char;
                    ++$out_len;
                    $mode = 'no';
                    continue;
                }
                $output[$out_len] = (int) $v;
                ++$out_len;
                continue;
            }
            if ('add' == $mode) {
                if (!self::$allow_overlong && $test == 'range') {
                    // Only the first continuation byte decides about overlong / out of range sequences
                    $test = 'none';
                    if ($start_byte == 0xC0 || $start_byte == 0xC1 || $start_byte > 0xF4
                            || ($v < 0xA0 && $start_byte == 0xE0)
                            || ($v < 0x90 && $start_byte == 0xF0)
                            || ($v > 0x8F && $start_byte == 0xF4)) {
                        throw new Exception('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                    }
                }
                if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
                    $v = ($v-128) << ($next_byte*6);
                    $output[($out_len-1)] += $v;
                    --$next_byte;
                } elseif (self::$safe_mode) {
                    // $safe_char already is a codepoint, it must not be passed through ord()
                    $output[$out_len-1] = self::$safe_char;
                    $k--; // Look at this byte again, it might start a new sequence
                    $mode = 'next';
                    continue;
                } else {
                    throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                }
                if ($next_byte < 0) {
                    $mode = 'next';
                }
            }
        } // for
        return $output;
    }

    /**
     * Convert UCS-4 array into UTF-8 string
     * See utf8_ucs4array() for details
     *
     * @param array $input Array of integer codepoints
     * @return string UTF-8 encoded string
     * @throws Exception On codepoints outside 0..0x1FFFFF when safe mode is off
     * @access private
     */
    private static function ucs4array_utf8($input)
    {
        $output = '';
        foreach ($input as $k => $v) {
            if ($v < 0 || $v >= (1 << 21)) {
                if (!self::$safe_mode) {
                    throw new Exception('Conversion from UCS-4 to UTF-8 failed: malformed input at index '.$k);
                }
                $v = self::$safe_char; // Gets encoded like any other codepoint below
            }
            if ($v < 128) { // 7bit are transferred literally
                $output .= chr($v);
            } elseif ($v < (1 << 11)) { // 2 bytes
                $output .= chr(192+($v >> 6)).chr(128+($v & 63));
            } elseif ($v < (1 << 16)) { // 3 bytes
                $output .= chr(224+($v >> 12)).chr(128+(($v >> 6) & 63)).chr(128+($v & 63));
            } else { // 4 bytes
                $output .= chr(240+($v >> 18)).chr(128+(($v >> 12) & 63)).chr(128+(($v >> 6) & 63)).chr(128+($v & 63));
            }
        }
        return $output;
    }

    /**
     * Convert modified UTF-7 (IMAP mailbox names) into UCS-4 array
     * @access private
     */
    private static function utf7imap_ucs4array($input)
    {
        return self::utf7_ucs4array($input, '&', true);
    }

    /**
     * Convert UTF-7 string into UCS-4 array
     *
     * @param string $input The UTF-7 string
     * @param string $sc Shift character opening a base64 run ('+' for UTF-7, '&' for IMAP)
     * @param bool $imap True to use the IMAP base64 alphabet (',' instead of '/')
     * @return array Array of integer codepoints
     * @access private
     */
    private static function utf7_ucs4array($input, $sc = '+', $imap = false)
    {
        $output = array();
        $inp_len = strlen($input);
        $alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+'.($imap ? ',' : '/');
        $mode = 'd'; // 'd': direct characters, 'b': inside a base64 run
        $b64 = '';

        for ($k = 0; $k < $inp_len; ++$k) {
            $c = $input[$k];
            if (0 == ord($c)) continue; // Ignore zero bytes
            if ('b' == $mode) {
                if (false !== strpos($alphabet, $c)) {
                    $b64 .= $c;
                    continue;
                }
                // Sequence got terminated
                $mode = 'd';
                if ('-' == $c && $b64 == '') { // Shift char directly followed by '-' is the literal shift char
                    $output[] = ord($sc);
                    continue;
                }
                foreach (self::utf7_decode_base64($b64, $imap) as $cp) {
                    $output[] = $cp;
                }
                $b64 = '';
                if ('-' == $c) continue; // Only '-' gets absorbed, any other terminator is a regular char
            }
            if ($sc == $c) {
                $mode = 'b';
                continue;
            }
            $output[] = ord($c);
        }
        // End of input terminates an open base64 run, too
        if ('b' == $mode && $b64 != '') {
            foreach (self::utf7_decode_base64($b64, $imap) as $cp) {
                $output[] = $cp;
            }
        }
        return $output;
    }

    /**
     * Decode the payload of one UTF-7 base64 run (UTF-16BE) into codepoints
     * Surrogate pairs are combined, a dangling odd byte is dropped
     * @access private
     */
    private static function utf7_decode_base64($b64, $imap)
    {
        if ($imap) $b64 = strtr($b64, ',', '/');
        $raw = (string) base64_decode($b64);
        $len = strlen($raw) - (strlen($raw) % 2);
        $output = array();
        for ($i = 0; $i < $len; $i += 2) {
            $unit = (ord($raw[$i]) << 8) | ord($raw[$i+1]);
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $len) {
                $low = (ord($raw[$i+2]) << 8) | ord($raw[$i+3]);
                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $unit = 0x10000 + (($unit - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2;
                }
            }
            $output[] = $unit;
        }
        return $output;
    }

    /**
     * Convert UCS-4 array into modified UTF-7 (IMAP mailbox names)
     * @access private
     */
    private static function ucs4array_utf7imap($input)
    {
        return self::ucs4array_utf7($input, '&', true);
    }

    /**
     * Convert UCS-4 array into UTF-7 string
     *
     * Codepoints 0x20..0x7E except the shift character are written directly,
     * everything else goes into base64 runs of UTF-16BE data.
     *
     * @param array $input Array of integer codepoints
     * @param string $sc Shift character opening a base64 run ('+' for UTF-7, '&' for IMAP)
     * @param bool $imap True to use the IMAP base64 alphabet (',' instead of '/')
     * @return string UTF-7 encoded string
     * @throws Exception On codepoints outside 0..0x10FFFF when safe mode is off
     * @access private
     */
    private static function ucs4array_utf7($input, $sc = '+', $imap = false)
    {
        $output = '';
        $b64 = ''; // Pending UTF-16BE bytes of the current base64 run
        $sc_ord = ord($sc);
        foreach ($input as $v) {
            if (0x20 <= $v && $v <= 0x7e && $v != $sc_ord) {
                if ($b64 != '') {
                    $output .= self::utf7_encode_base64($b64, $sc, $imap);
                    $b64 = '';
                }
                $output .= chr($v);
            } else {
                $b64 .= self::utf16be_bytes($v);
            }
        }
        if ($b64 != '') {
            $output .= self::utf7_encode_base64($b64, $sc, $imap);
        }
        return $output;
    }

    /**
     * Wrap the UTF-16BE bytes of one run into a complete UTF-7 base64 sequence
     * A run consisting of the shift character only is written in its short form
     * @access private
     */
    private static function utf7_encode_base64($b64, $sc, $imap)
    {
        if ($b64 == chr(0).$sc) return $sc.'-';
        $encoded = str_replace('=', '', base64_encode($b64));
        if ($imap) $encoded = strtr($encoded, '/', ',');
        return $sc.$encoded.'-';
    }

    /**
     * UTF-16BE byte sequence of a single codepoint (surrogate pair above U+FFFF)
     * @access private
     */
    private static function utf16be_bytes($v)
    {
        if ($v < 0 || $v > 0x10FFFF) {
            if (!self::$safe_mode) {
                throw new Exception('Conversion from UCS-4 to UTF-7 failed: codepoint out of range');
            }
            $v = self::$safe_char;
        }
        if ($v < 0x10000) return pack('n', $v);
        $v -= 0x10000;
        return pack('nn', 0xD800 | ($v >> 10), 0xDC00 | ($v & 0x3FF));
    }

    /**
     * Convert UCS-4 array into UCS-4 string (Big Endian at the moment)
     * @access private
     */
    private static function ucs4array_ucs4($input)
    {
        $output = '';
        foreach ($input as $v) {
            $output .= pack('N', $v);
        }
        return $output;
    }

    /**
     * Convert UCS-4 string (Big Endian at the moment) into UCS-4 array
     *
     * @throws Exception When the input length is no multiple of 4
     * @access private
     */
    private static function ucs4_ucs4array($input)
    {
        $inp_len = strlen($input);
        // Input length must be dividable by 4
        if ($inp_len % 4) {
            throw new Exception('Input UCS4 string is broken');
        }
        // Empty input - return empty output
        if (!$inp_len) return array();

        // unpack() hands back a 1-based array
        return array_values(unpack('N*', $input));
    }
}
?>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Sun Jan 29 16:31:14 2012 | Cross-referenced by PHPXref 0.7.1 |