[ Index ]

PHP Cross Reference of phpwcms V1.5.0 _r431 (28.01.12)

title

Body

[close]

/include/inc_ext/idna_convert/ -> uctc.php (source)

   1  <?php
   2  /**
   3   * UCTC - The Unicode Transcoder
   4   *
   5   * Converts between various flavours of Unicode representations like UCS-4 or UTF-8
   6   * Supported schemes:
   7   * - UCS-4 Little Endian / Big Endian / Array (partially)
   8   * - UTF-16 Little Endian / Big Endian (not yet)
   9   * - UTF-8
  10   * - UTF-7
  11   * - UTF-7 IMAP (modified UTF-7)
  12   *
  13   * @package phlyMail Nahariya 4.0+ Default branch
  14   * @author Matthias Sommerfeld  <mso@phlyLabs.de>
  15   * @copyright 2003-2009 phlyLabs Berlin, http://phlylabs.de
  16   * @version 0.0.6 2009-05-10
  17   */
  18  class uctc {
  19      private static $mechs = array('ucs4', /*'ucs4le', 'ucs4be', */'ucs4array', /*'utf16', 'utf16le', 'utf16be', */'utf8', 'utf7', 'utf7imap');
  20      private static $allow_overlong = false;
  21      private static $safe_mode;
  22      private static $safe_char;
  23  
  24      /**
  25       * The actual conversion routine
  26       *
  27       * @param mixed $data  The data to convert, usually a string, array when converting from UCS-4 array
  28       * @param string $from  Original encoding of the data
  29       * @param string $to  Target encoding of the data
  30       * @param bool $safe_mode  SafeMode tries to correct invalid codepoints
  31       * @return mixed  False on failure, String or array on success, depending on target encoding
  32       * @access public
  33       * @since 0.0.1
  34       */
  35      public static function convert($data, $from, $to, $safe_mode = false, $safe_char = 0xFFFC)
  36      {
  37          self::$safe_mode = ($safe_mode) ? true : false;
  38          self::$safe_char = ($safe_char) ? $safe_char : 0xFFFC;
  39          if (self::$safe_mode) self::$allow_overlong = true;
  40          if (!in_array($from, self::$mechs)) throw new Exception('Invalid input format specified');
  41          if (!in_array($to, self::$mechs)) throw new Exception('Invalid output format specified');
  42          if ($from != 'ucs4array') eval('$data = self::'.$from.'_ucs4array($data);');
  43          if ($to != 'ucs4array') eval('$data = self::ucs4array_'.$to.'($data);');
  44          return $data;
  45      }
  46  
  47      /**
  48       * This converts an UTF-8 encoded string to its UCS-4 representation
  49       *
  50       * @param string $input  The UTF-8 string to convert
  51       * @return array  Array of 32bit values representing each codepoint
  52       * @access private
  53       */
  54      private static function utf8_ucs4array($input)
  55      {
  56          $output = array();
  57          $out_len = 0;
  58          $inp_len = strlen($input);
  59          $mode = 'next';
  60          $test = 'none';
  61          for ($k = 0; $k < $inp_len; ++$k) {
  62              $v = ord($input{$k}); // Extract byte from input string
  63  
  64              if ($v < 128) { // We found an ASCII char - put into stirng as is
  65                  $output[$out_len] = $v;
  66                  ++$out_len;
  67                  if ('add' == $mode) {
  68                      if (self::$safe_mode) {
  69                          $output[$out_len-2] = self::$safe_char;
  70                          $mode = 'next';
  71                      } else {
  72                          throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
  73                      }
  74                  }
  75                  continue;
  76              }
  77              if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
  78                  $start_byte = $v;
  79                  $mode = 'add';
  80                  $test = 'range';
  81                  if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
  82                      $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
  83                      $v = ($v - 192) << 6;
  84                  } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
  85                      $next_byte = 1;
  86                      $v = ($v - 224) << 12;
  87                  } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  88                      $next_byte = 2;
  89                      $v = ($v - 240) << 18;
  90                  } elseif (self::$safe_mode) {
  91                      $mode = 'next';
  92                      $output[$out_len] = self::$safe_char;
  93                      ++$out_len;
  94                      continue;
  95                  } else {
  96                      throw new Exception('This might be UTF-8, but I don\'t understand it at byte '.$k);
  97                  }
  98                  if ($inp_len-$k-$next_byte < 2) {
  99                      $output[$out_len] = self::$safe_char;
 100                      $mode = 'no';
 101                      continue;
 102                  }
 103  
 104                  if ('add' == $mode) {
 105                      $output[$out_len] = (int) $v;
 106                      ++$out_len;
 107                      continue;
 108                  }
 109              }
 110              if ('add' == $mode) {
 111                  if (!self::$allow_overlong && $test == 'range') {
 112                      $test = 'none';
 113                      if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
 114                          throw new Exception('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
 115                      }
 116                  }
 117                  if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
 118                      $v = ($v-128) << ($next_byte*6);
 119                      $output[($out_len-1)] += $v;
 120                      --$next_byte;
 121                  } else {
 122                      if (self::$safe_mode) {
 123                          $output[$out_len-1] = ord(self::$safe_char);
 124                          $k--;
 125                          $mode = 'next';
 126                          continue;
 127                      } else {
 128                          throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
 129                      }
 130                  }
 131                  if ($next_byte < 0) {
 132                      $mode = 'next';
 133                  }
 134              }
 135          } // for
 136          return $output;
 137      }
 138  
 139      /**
 140       * Convert UCS-4 string into UTF-8 string
 141       * See utf8_ucs4array() for details
 142       * @access   private
 143       */
 144      private static function ucs4array_utf8($input)
 145      {
 146          $output = '';
 147          foreach ($input as $v) {
 148              if ($v < 128) { // 7bit are transferred literally
 149                  $output .= chr($v);
 150              } elseif ($v < (1 << 11)) { // 2 bytes
 151                  $output .= chr(192+($v >> 6)).chr(128+($v & 63));
 152              } elseif ($v < (1 << 16)) { // 3 bytes
 153                  $output .= chr(224+($v >> 12)).chr(128+(($v >> 6) & 63)).chr(128+($v & 63));
 154              } elseif ($v < (1 << 21)) { // 4 bytes
 155                  $output .= chr(240+($v >> 18)).chr(128+(($v >> 12) & 63)).chr(128+(($v >> 6) & 63)).chr(128+($v & 63));
 156              } elseif (self::$safe_mode) {
 157                  $output .= self::$safe_char;
 158              } else {
 159                  throw new Exception('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
 160              }
 161          }
 162          return $output;
 163      }
 164  
 165      private static function utf7imap_ucs4array($input)
 166      {
 167          return self::utf7_ucs4array(str_replace(',', '/', $input), '&');
 168      }
 169  
 170      private static function utf7_ucs4array($input, $sc = '+')
 171      {
 172          $output  = array();
 173          $out_len = 0;
 174          $inp_len = strlen($input);
 175          $mode    = 'd';
 176          $b64     = '';
 177  
 178          for ($k = 0; $k < $inp_len; ++$k) {
 179              $c = $input{$k};
 180              if (0 == ord($c)) continue; // Ignore zero bytes
 181              if ('b' == $mode) {
 182                  // Sequence got terminated
 183                  if (!preg_match('![A-Za-z0-9/'.preg_quote($sc, '!').']!', $c)) {
 184                      if ('-' == $c) {
 185                          if ($b64 == '') {
 186                              $output[$out_len] = ord($sc);
 187                              $out_len++;
 188                              $mode = 'd';
 189                              continue;
 190                          }
 191                      }
 192                      $tmp = base64_decode($b64);
 193                      $tmp = substr($tmp, -1 * (strlen($tmp) % 2));
 194                      for ($i = 0; $i < strlen($tmp); $i++) {
 195                          if ($i % 2) {
 196                              $output[$out_len] += ord($tmp{$i});
 197                              $out_len++;
 198                          } else {
 199                              $output[$out_len] = ord($tmp{$i}) << 8;
 200                          }
 201                      }
 202                      $mode = 'd';
 203                      $b64 = '';
 204                      continue;
 205                  } else {
 206                      $b64 .= $c;
 207                  }
 208              }
 209              if ('d' == $mode) {
 210                  if ($sc == $c) {
 211                      $mode = 'b';
 212                      continue;
 213                  }
 214                  $output[$out_len] = ord($c);
 215                  $out_len++;
 216              }
 217          }
 218          return $output;
 219      }
 220  
 221      private static function ucs4array_utf7imap($input)
 222      {
 223          return str_replace('/', ',', self::ucs4array_utf7($input, '&'));
 224      }
 225  
 226      private static function ucs4array_utf7($input, $sc = '+')
 227      {
 228          $output = '';
 229          $mode = 'd';
 230          $b64 = '';
 231          while (true) {
 232              $v = (!empty($input)) ? array_shift($input) : false;
 233              $is_direct = (false !== $v) ? (0x20 <= $v && $v <= 0x7e && $v != ord($sc)) : true;
 234              if ($mode == 'b') {
 235                  if ($is_direct) {
 236                      if ($b64 == chr(0).$sc) {
 237                          $output .= $sc.'-';
 238                          $b64 = '';
 239                      } elseif ($b64) {
 240                          $output .= $sc.str_replace('=', '', base64_encode($b64)).'-';
 241                          $b64 = '';
 242                      }
 243                      $mode = 'd';
 244                  } elseif (false !== $v) {
 245                      $b64 .= chr(($v >> 8) & 255). chr($v & 255);
 246                  }
 247              }
 248              if ($mode == 'd' && false !== $v) {
 249                  if ($is_direct) {
 250                      $output .= chr($v);
 251                  } else {
 252                      $b64 = chr(($v >> 8) & 255). chr($v & 255);
 253                      $mode = 'b';
 254                  }
 255              }
 256              if (false === $v && $b64 == '') break;
 257          }
 258          return $output;
 259      }
 260  
 261      /**
 262       * Convert UCS-4 array into UCS-4 string (Little Endian at the moment)
 263       * @access   private
 264       */
 265      private static function ucs4array_ucs4($input)
 266      {
 267          $output = '';
 268          foreach ($input as $v) {
 269              $output .= chr(($v >> 24) & 255).chr(($v >> 16) & 255).chr(($v >> 8) & 255).chr($v & 255);
 270          }
 271          return $output;
 272      }
 273  
 274      /**
 275       * Convert UCS-4 string (LE in the moment) into UCS-4 garray
 276       * @access   private
 277       */
 278      private static function ucs4_ucs4array($input)
 279      {
 280          $output = array();
 281  
 282          $inp_len = strlen($input);
 283          // Input length must be dividable by 4
 284          if ($inp_len % 4) {
 285              throw new Exception('Input UCS4 string is broken');
 286          }
 287          // Empty input - return empty output
 288          if (!$inp_len) return $output;
 289  
 290          for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) {
 291              if (!($i % 4)) { // Increment output position every 4 input bytes
 292                  $out_len++;
 293                  $output[$out_len] = 0;
 294              }
 295              $output[$out_len] += ord($input{$i}) << (8 * (3 - ($i % 4) ) );
 296          }
 297          return $output;
 298      }
 299  }
 300  ?>


Generated: Sun Jan 29 16:31:14 2012 Cross-referenced by PHPXref 0.7.1