[ Index ]

PHP Cross Reference of phpwcms V1.4.3 _r380 (23.11.09)

title

Body

[close]

/include/inc_ext/htmlfilter/ -> htmlfilter.php (source)

   1  <?php
   2  /**
   3   * htmlfilter.inc
   4   * ---------------
   5   * This set of functions allows you to filter html in order to remove
   6   * any malicious tags from it. Useful in cases when you need to filter
   7   * user input for any cross-site-scripting attempts.
   8   *
   9   * Copyright (C) 2002-2004 by Duke University
  10   *
  11   * This library is free software; you can redistribute it and/or
  12   * modify it under the terms of the GNU Lesser General Public
  13   * License as published by the Free Software Foundation; either
  14   * version 2.1 of the License, or (at your option) any later version.
  15   *
  16   * This library is distributed in the hope that it will be useful,
  17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19   * Lesser General Public License for more details.
  20   *
  21   * You should have received a copy of the GNU Lesser General Public
  22   * License along with this library; if not, write to the Free Software
  23   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  
  24   * 02110-1301  USA
  25   *
  26   * @Author  Konstantin Riabitsev <icon@linux.duke.edu>
  27   * @Version 1.1 ($Date: 2005/06/30 18:06:08 $)
  28   */
  29  
  30  /**
  31   * This is a debugging function used throughout the code. To enable
  32   * debugging you have to specify a global variable called "debug" before
  33   * calling sanitize() and set it to true. 
  34   *
  35   * Note: Although insignificantly, debugging does slow you down even
  36   * when $debug is set to false. If you wish to get rid of all
  37   * debugging calls, run the following command:
  38   *
  39   * fgrep -v 'spew("' htmlfilter.inc > htmlfilter.inc.new
  40   *
  41   * htmlfilter.inc.new will contain no debugging calls.
  42   *
  43   * @param  $message  A string with the message to output.
  44   * @return           void.
  45   */
  46  function spew($message){
  47      global $debug;
  48      if ($debug == true){
  49          echo "$message";
  50      }
  51  }
  52  
  53  /**
  54   * This function returns the final tag out of the tag name, an array
  55   * of attributes, and the type of the tag. This function is called by 
  56   * sanitize internally.
  57   *
  58   * @param  $tagname  the name of the tag.
  59   * @param  $attary   the array of attributes and their values
  60   * @param  $tagtype  The type of the tag (see in comments).
  61   * @return           a string with the final tag representation.
  62   */
  63  function tagprint($tagname, $attary, $tagtype){
  64      $me = 'tagprint';
  65      if ($tagtype == 2){
  66          $fulltag = '</' . $tagname . '>';
  67      } else {
  68          $fulltag = '<' . $tagname;
  69          if (is_array($attary) && sizeof($attary)){
  70              $atts = Array();
  71              while (list($attname, $attvalue) = each($attary)){
  72                  array_push($atts, "$attname=$attvalue");
  73              }
  74              $fulltag .= ' ' . join(' ', $atts);
  75          }
  76          if ($tagtype == 3){
  77              $fulltag .= ' /';
  78          }
  79          $fulltag .= '>';
  80      }
  81      spew("$me: $fulltag\n");
  82      return $fulltag;
  83  }
  84  
  85  /**
  86   * A small helper function to use with array_walk. Modifies a by-ref
  87   * value and makes it lowercase.
  88   *
  89   * @param  $val a value passed by-ref.
  90   * @return      void since it modifies a by-ref value.
  91   */
  92  function casenormalize(&$val){
  93      $val = strtolower($val);
  94  }
  95  
  96  /**
  97   * This function skips any whitespace from the current position within
  98   * a string and to the next non-whitespace value.
  99   * 
 100   * @param  $body   the string
 101   * @param  $offset the offset within the string where we should start
 102   *                 looking for the next non-whitespace character.
 103   * @return         the location within the $body where the next
 104   *                 non-whitespace char is located.
 105   */
 106  function skipspace($body, $offset){
 107      $me = 'skipspace';
 108      preg_match('/^(\s*)/s', substr($body, $offset), $matches);
 109      if (sizeof($matches{1})){
 110          $count = strlen($matches{1});
 111          spew("$me: skipped $count chars\n");
 112          $offset += $count;
 113      }
 114      return $offset;
 115  }
 116  
 117  /**
 118   * This function looks for the next character within a string.  It's
 119   * really just a glorified "strpos", except it catches the failures
 120   * nicely.
 121   *
 122   * @param  $body   The string to look for needle in.
 123   * @param  $offset Start looking from this position.
 124   * @param  $needle The character/string to look for.
 125   * @return         location of the next occurance of the needle, or
 126   *                 strlen($body) if needle wasn't found.
 127   */
 128  function findnxstr($body, $offset, $needle){
 129      $me = 'findnxstr';
 130      $pos = strpos($body, $needle, $offset);
 131      if ($pos === FALSE){
 132          $pos = strlen($body);
 133          spew("$me: end of body reached\n");
 134      }
 135      spew("$me: '$needle' found at pos $pos\n");
 136      return $pos;
 137  }
 138  
 139  /**
 140   * This function takes a PCRE-style regexp and tries to match it
 141   * within the string.
 142   *
 143   * @param  $body   The string to look for needle in.
 144   * @param  $offset Start looking from here.
 145   * @param  $reg    A PCRE-style regex to match.
 146   * @return         Returns a false if no matches found, or an array
 147   *                 with the following members:
 148   *                 - integer with the location of the match within $body
 149   *                 - string with whatever content between offset and the match
 150   *                 - string with whatever it is we matched
 151   */
 152  function findnxreg($body, $offset, $reg){
 153      $me = 'findnxreg';
 154      $matches = Array();
 155      $retarr = Array();
 156      $preg_rule = '%^(.*?)(' . $reg . ')%s';
 157      preg_match($preg_rule, substr($body, $offset), $matches);
 158      if (!isset($matches{0})){
 159          spew("$me: No matches found.\n");
 160          $retarr = false;
 161      } else {
 162          $retarr{0} = $offset + strlen($matches{1});
 163          $retarr{1} = $matches{1};
 164          $retarr{2} = $matches{2};
 165          spew("$me: '$reg' found at pos $offset matching '".$matches{2}."'\n");
 166      }
 167      return $retarr;
 168  }
 169  
 170  /**
 171   * This function looks for the next tag.
 172   *
 173   * @param  $body   String where to look for the next tag.
 174   * @param  $offset Start looking from here.
 175   * @return         false if no more tags exist in the body, or
 176   *                 an array with the following members:
 177   *                 - string with the name of the tag
 178   *                 - array with attributes and their values
 179   *                 - integer with tag type (1, 2, or 3)
 180   *                 - integer where the tag starts (starting "<")
 181   *                 - integer where the tag ends (ending ">")
 182   *                 first three members will be false, if the tag is invalid.
 183   */
 184  function getnxtag($body, $offset){
 185      $me = 'getnxtag';
 186      if ($offset > strlen($body)){
 187          spew("$me: Past the end of body\n");
 188          return false;
 189      }
 190      $lt = findnxstr($body, $offset, '<');
 191      if ($lt == strlen($body)){
 192          spew("$me: No more tags found!\n");
 193          return false;
 194      }
 195      /**
 196       * We are here:
 197       * blah blah <tag attribute="value">
 198       * \---------^
 199       */
 200      spew("$me: Found '<' at pos $lt\n");
 201      $pos = skipspace($body, $lt + 1);
 202      if ($pos >= strlen($body)){
 203          spew("$me: End of body reached.\n");
 204          return Array(false, false, false, $lt, strlen($body));
 205      }
 206      /**
 207       * There are 3 kinds of tags:
 208       * 1. Opening tag, e.g.:
 209       *    <a href="blah">
 210       * 2. Closing tag, e.g.:
 211       *    </a>
 212       * 3. XHTML-style content-less tag, e.g.:
 213       *    <img src="blah"/>
 214       */
 215      $tagtype = false;
 216      switch (substr($body, $pos, 1)){
 217      case '/':
 218          spew("$me: This is a closing tag (type 2)\n");
 219          $tagtype = 2;
 220          $pos++;
 221          break;
 222      case '!':
 223          /**
 224           * A comment or an SGML declaration.
 225           */
 226          if (substr($body, $pos+1, 2) == '--'){
 227              spew("$me: A comment found. Stripping.\n");
 228              $gt = strpos($body, '-->', $pos);
 229              if ($gt === false){
 230                  $gt = strlen($body);
 231              } else {
 232                  $gt += 2;
 233              }
 234              return Array(false, false, false, $lt, $gt);
 235          } else {
 236              spew("$me: An SGML declaration found. Stripping.\n");
 237              $gt = findnxstr($body, $pos, '>');
 238              return Array(false, false, false, $lt, $gt);
 239          }
 240          break;
 241      default:
 242          /**
 243           * Assume tagtype 1 for now. If it's type 3, we'll switch values
 244           * later.
 245           */
 246          $tagtype = 1;
 247          break;
 248      }
 249      
 250      $tag_start = $pos;
 251      $tagname = '';
 252      /**
 253       * Look for next [\W-_], which will indicate the end of the tag name.
 254       */
 255      $regary = findnxreg($body, $pos, '[^\w\-_]');
 256      if ($regary == false){
 257          spew("$me: End of body reached while analyzing tag name\n");
 258          return Array(false, false, false, $lt, strlen($body));
 259      }
 260      list($pos, $tagname, $match) = $regary;
 261      $tagname = strtolower($tagname);
 262      
 263      /**
 264       * $match can be either of these:
 265       * '>'  indicating the end of the tag entirely.
 266       * '\s' indicating the end of the tag name.
 267       * '/'  indicating that this is type-3 xhtml tag.
 268       * 
 269       * Whatever else we find there indicates an invalid tag.
 270       */
 271      switch ($match){
 272      case '/':
 273          /**
 274           * This is an xhtml-style tag with a closing / at the
 275           * end, like so: <img src="blah"/>. Check if it's followed
 276           * by the closing bracket. If not, then this tag is invalid
 277           */
 278          if (substr($body, $pos, 2) == '/>'){
 279              spew("$me: XHTML-style tag found.\n");
 280              $pos++;
 281              spew("$me: Setting tagtype to 3\n");
 282              $tagtype = 3;
 283          } else {
 284              spew("$me: Found invalid character '/'.\n");
 285              $gt = findnxstr($body, $pos, '>');
 286              spew("$me: Tag is invalid. Returning.\n");
 287              $retary = Array(false, false, false, $lt, $gt);
 288              return $retary;
 289          }
 290      case '>':
 291          spew("$me: End of tag found at $pos\n");
 292          spew("$me: Tagname is '$tagname'\n");
 293          spew("$me: This tag has no attributes\n");
 294          return Array($tagname, false, $tagtype, $lt, $pos);
 295          break;
 296      default:
 297          /**
 298           * Check if it's whitespace
 299           */
 300          if (preg_match('/\s/', $match)){
 301              spew("$me: Tagname is '$tagname'\n");
 302          } else {
 303              /**
 304               * This is an invalid tag! Look for the next closing ">".
 305               */
 306              spew("$me: Invalid characters found in tag name: $match\n");
 307              $gt = findnxstr($body, $lt, '>');
 308              return Array(false, false, false, $lt, $gt);
 309          }
 310      }
 311      
 312      /**
 313       * At this point we're here:
 314       * <tagname  attribute='blah'>
 315       * \-------^
 316       *
 317       * At this point we loop in order to find all attributes.
 318       */
 319      $attname = '';
 320      $atttype = false;
 321      $attary = Array();
 322      
 323      while ($pos <= strlen($body)){
 324          $pos = skipspace($body, $pos);
 325          if ($pos == strlen($body)){
 326              /**
 327               * Non-closed tag.
 328               */
 329              spew("$me: End of body reached before end of tag. Discarding.\n");
 330              return Array(false, false, false, $lt, $pos);
 331          }
 332          /**
 333           * See if we arrived at a ">" or "/>", which means that we reached
 334           * the end of the tag.
 335           */
 336          $matches = Array();
 337          preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches);
 338          if (isset($matches{0}) && $matches{0}){
 339              /**
 340               * Yep. So we did.
 341               */
 342              spew("$me: Arrived at the end of the tag.\n");
 343              $pos += strlen($matches{1});
 344              if ($matches{2} == '/>'){
 345                  $tagtype = 3;
 346                  $pos++;
 347              }
 348              return Array($tagname, $attary, $tagtype, $lt, $pos);
 349          }
 350          
 351          /**
 352           * There are several types of attributes, with optional
 353           * [:space:] between members.
 354           * Type 1:
 355           *   attrname[:space:]=[:space:]'CDATA'
 356           * Type 2:
 357           *   attrname[:space:]=[:space:]"CDATA"
 358           * Type 3:
 359           *   attr[:space:]=[:space:]CDATA
 360           * Type 4:
 361           *   attrname
 362           *
 363           * We leave types 1 and 2 the same, type 3 we check for
 364           * '"' and convert to "&quot" if needed, then wrap in
 365           * double quotes. Type 4 we convert into:
 366           * attrname="yes".
 367           */
 368          $regary = findnxreg($body, $pos, '[^\w\-_]');
 369          if ($regary == false){
 370              /**
 371               * Looks like body ended before the end of tag.
 372               */
 373              spew("$me: End of body found before end of tag.\n");
 374              spew("$me: Invalid, returning\n");
 375              return Array(false, false, false, $lt, strlen($body));
 376          }
 377          list($pos, $attname, $match) = $regary;
 378          $attname = strtolower($attname);
 379          spew("$me: Attribute '$attname' found\n");
 380          /**
 381           * We arrived at the end of attribute name. Several things possible
 382           * here:
 383           * '>'  means the end of the tag and this is attribute type 4
 384           * '/'  if followed by '>' means the same thing as above
 385           * '\s' means a lot of things -- look what it's followed by.
 386           *      anything else means the attribute is invalid.
 387           */
 388          switch($match){
 389          case '/':
 390              /**
 391               * This is an xhtml-style tag with a closing / at the
 392               * end, like so: <img src="blah"/>. Check if it's followed
 393               * by the closing bracket. If not, then this tag is invalid
 394               */
 395              if (substr($body, $pos, 2) == '/>'){
 396                  spew("$me: This is an xhtml-style tag.\n");
 397                  $pos++;
 398                  spew("$me: Setting tagtype to 3\n");
 399                  $tagtype = 3;
 400              } else {
 401                  spew("$me: Found invalid character '/'.\n");
 402                  $gt = findnxstr($body, $pos, '>');
 403                  spew("$me: Tag is invalid. Returning.\n");
 404                  $retary = Array(false, false, false, $lt, $gt);
 405                  return $retary;
 406              }
 407          case '>':
 408              spew("$me: found type 4 attribute.\n");
 409              spew("$me: Additionally, end of tag found at $pos\n");
 410              spew("$me: Attname is '$attname'\n");
 411              spew("$me: Setting attvalue to 'yes'\n");
 412              $attary{$attname} = '"yes"';
 413              return Array($tagname, $attary, $tagtype, $lt, $pos);
 414              break;
 415          default:
 416              /**
 417               * Skip whitespace and see what we arrive at.
 418               */
 419              $pos = skipspace($body, $pos);
 420              $char = substr($body, $pos, 1);
 421              /**
 422               * Two things are valid here:
 423               * '=' means this is attribute type 1 2 or 3.
 424               * \w means this was attribute type 4.
 425               * anything else we ignore and re-loop. End of tag and
 426               * invalid stuff will be caught by our checks at the beginning
 427               * of the loop.
 428               */
 429              if ($char == '='){
 430                  spew("$me: Attribute type 1, 2, or 3 found.\n");
 431                  $pos++;
 432                  $pos = skipspace($body, $pos);
 433                  /**
 434                   * Here are 3 possibilities:
 435                   * "'"  attribute type 1
 436                   * '"'  attribute type 2
 437                   * everything else is the content of tag type 3
 438                   */
 439                  $quot = substr($body, $pos, 1);
 440                  if ($quot == '\''){
 441                      spew("$me: In fact, this is attribute type 1\n");
 442                      spew("$me: looking for closing quote\n");
 443                      $regary = findnxreg($body, $pos+1, '\'');
 444                      if ($regary == false){
 445                          spew("$me: end of body reached before end of val\n");
 446                          spew("$me: Returning\n");
 447                          return Array(false, false, false, $lt, strlen($body));
 448                      }
 449                      list($pos, $attval, $match) = $regary;
 450                      spew("$me: Attvalue is '$attval'\n");
 451                      $pos++;
 452                      $attary{$attname} = '\'' . $attval . '\'';
 453                  } else if ($quot == '"'){
 454                      spew("$me: In fact, this is attribute type 2\n");
 455                      spew("$me: looking for closing quote\n");
 456                      $regary = findnxreg($body, $pos+1, '\"');
 457                      if ($regary == false){
 458                          spew("$me: end of body reached before end of val\n");
 459                          spew("$me: Returning\n");
 460                          return Array(false, false, false, $lt, strlen($body));
 461                      }
 462                      list($pos, $attval, $match) = $regary;
 463                      spew("$me: Attvalue is \"$attval\"\n");
 464                      $pos++;
 465                      $attary{$attname} = '"' . $attval . '"';
 466                  } else {
 467                      spew("$me: This looks like attribute type 3\n");
 468                      /**
 469                       * These are hateful. Look for \s, or >.
 470                       */
 471                      spew("$me: Looking for end of attval\n");
 472                      $regary = findnxreg($body, $pos, '[\s>]');
 473                      if ($regary == false){
 474                          spew("$me: end of body reached before end of val\n");
 475                          spew("$me: Returning\n");
 476                          return Array(false, false, false, $lt, strlen($body));
 477                      }
 478                      list($pos, $attval, $match) = $regary;
 479                      /**
 480                       * If it's ">" it will be caught at the top.
 481                       */
 482                      spew("$me: translating '\"' into &quot;\n");
 483                      $attval = preg_replace('/\"/s', '&quot;', $attval);
 484                      spew("$me: wrapping in quotes\n");
 485                      $attary{$attname} = '"' . $attval . '"';
 486                  }
 487              } else if (preg_match('|[\w/>]|', $char)) {
 488                  /**
 489                   * That was attribute type 4.
 490                   */
 491                  spew("$me: attribute type 4 found.\n");
 492                  spew("$me: Setting value to 'yes'\n");
 493                  $attary{$attname} = '"yes"';
 494              } else {
 495                  /**
 496                   * An illegal character. Find next '>' and return.
 497                   */
 498                  spew("$me: illegal character '$char' found.\n");
 499                  spew("$me: returning\n");
 500                  $gt = findnxstr($body, $pos, '>');
 501                  return Array(false, false, false, $lt, $gt);
 502              }
 503          }
 504      }
 505      /**
 506       * The fact that we got here indicates that the tag end was never
 507       * found. Return invalid tag indication so it gets stripped.
 508       */
 509      spew("$me: No tag end found\n");
 510      return Array(false, false, false, $lt, strlen($body));
 511  }
 512  
 513  /**
 514   * Translates entities into literal values so they can be checked.
 515   *
 516   * @param $attvalue the by-ref value to check.
 517   * @param $regex    the regular expression to check against.
 518   * @param $hex      whether the entites are hexadecimal.
 519   * @return          True or False depending on whether there were matches.
 520   */
 521  function deent(&$attvalue, $regex, $hex=false){
 522      $me = 'deent';
 523      spew("$me: matching '$regex' against: $attvalue\n");
 524      $ret_match = false;
 525      preg_match_all($regex, $attvalue, $matches);
 526      if (is_array($matches) && sizeof($matches[0]) > 0){
 527          spew("$me: found " . sizeof($matches[0]) . " matches\n");
 528          $repl = Array();
 529          for ($i = 0; $i < sizeof($matches[0]); $i++){
 530              $numval = $matches[1][$i];
 531              spew("$me: numval is $numval\n");
 532              if ($hex){
 533                  $numval = hexdec($numval);
 534                  spew("$me: hex! Numval is now $numval\n");
 535              }
 536              $repl{$matches[0][$i]} = chr($numval);
 537          }
 538          $attvalue = strtr($attvalue, $repl);
 539          spew("$me: attvalue after translation: $attvalue\n");
 540          return true;
 541      } else {
 542          spew("$me: no matches! Returning false.\n");
 543          return false;
 544      }
 545  }
 546  
 547  /**
 548   * This function checks attribute values for entity-encoded values
 549   * and returns them translated into 8-bit strings so we can run
 550   * checks on them.
 551   *
 552   * @param  $attvalue A string to run entity check against.
 553   * @return           Nothing, modifies a reference value.
 554   */
 555  function defang(&$attvalue){
 556      $me = 'defang';
 557      /**
 558       * Skip this if there aren't ampersands or backslashes.
 559       */
 560      spew("$me: Checking '$attvalue' for suspicious content\n");
 561      if (strpos($attvalue, '&') === false
 562          && strpos($attvalue, '\\') === false){
 563          spew("$me: no suspicious content found, returning.\n");
 564          return;
 565      }
 566      $m = false;
 567      do {
 568          $m = false;
 569          $m = $m || deent($attvalue, '/\&#0*(\d+);*/s');
 570          $m = $m || deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true);
 571          $m = $m || deent($attvalue, '/\\\\(\d+)/s', true);
 572          spew("$me: m=$m\n");
 573      } while ($m == true);
 574      $attvalue = stripslashes($attvalue);
 575      spew("$me: translated into: $attvalue\n");
 576  }
 577  
 578  /**
 579   * Kill any tabs, newlines, or carriage returns. Our friends the
 580   * makers of the browser with 95% market value decided that it'd
 581   * be funny to make "java[tab]script" be just as good as "javascript".
 582   * 
 583   * @param  attvalue  The attribute value before extraneous spaces removed.
 584   * @return attvalue  Nothing, modifies a reference value.
 585   */
 586  function unspace(&$attvalue){
 587      $me = 'unspace';
 588      if (strcspn($attvalue, "\t\r\n\0 ") != strlen($attvalue)){
 589          spew("$me: Killing whitespace.\n");
 590          $attvalue = str_replace(Array("\t", "\r", "\n", "\0", " "), 
 591                                  Array('',   '',   '',   '',   ''), $attvalue);
 592      }
 593      spew("$me: after unspace: $attvalue\n");
 594  }
 595  
 596  /**
 597   * This function runs various checks against the attributes.
 598   *
 599   * @param  $tagname         String with the name of the tag.
 600   * @param  $attary          Array with all tag attributes.
 601   * @param  $rm_attnames     See description for sanitize
 602   * @param  $bad_attvals     See description for sanitize
 603   * @param  $add_attr_to_tag See description for sanitize
 604   * @return                  Array with modified attributes.
 605   */
 606  function fixatts($tagname, 
 607                   $attary, 
 608                   $rm_attnames,
 609                   $bad_attvals,
 610                   $add_attr_to_tag
 611                   ){
 612      $me = 'fixatts';
 613      spew("$me: Fixing attributes\n");
 614      while (list($attname, $attvalue) = each($attary)){
 615          /**
 616           * See if this attribute should be removed.
 617           */
 618          foreach ($rm_attnames as $matchtag=>$matchattrs){
 619              if (preg_match($matchtag, $tagname)){
 620                  foreach ($matchattrs as $matchattr){
 621                      if (preg_match($matchattr, $attname)){
 622                          spew("$me: Attribute '$attname' defined as bad.\n");
 623                          spew("$me: Removing.\n");
 624                          unset($attary{$attname});
 625                          continue;
 626                      }
 627                  }
 628              }
 629          }
 630          /**
 631           * Remove any backslashes, entities, or extraneous whitespace.
 632           */
 633          defang($attvalue);
 634          unspace($attvalue);
 635          
 636          /**
 637           * Now let's run checks on the attvalues.
 638           * I don't expect anyone to comprehend this. If you do,
 639           * get in touch with me so I can drive to where you live and
 640           * shake your hand personally. :)
 641           */
 642          foreach ($bad_attvals as $matchtag=>$matchattrs){
 643              if (preg_match($matchtag, $tagname)){
 644                  foreach ($matchattrs as $matchattr=>$valary){
 645                      if (preg_match($matchattr, $attname)){
 646                          /**
 647                           * There are two arrays in valary.
 648                           * First is matches.
 649                           * Second one is replacements
 650                           */
 651                          list($valmatch, $valrepl) = $valary;
 652                          $newvalue = preg_replace($valmatch,$valrepl,$attvalue);
 653                          if ($newvalue != $attvalue){
 654                              spew("$me: attvalue is now $newvalue\n");
 655                              $attary{$attname} = $newvalue;
 656                          }
 657                      }
 658                  }
 659              }
 660          }
 661      }
 662      /**
 663       * See if we need to append any attributes to this tag.
 664       */
 665      foreach ($add_attr_to_tag as $matchtag=>$addattary){
 666          if (preg_match($matchtag, $tagname)){
 667              $attary = array_merge($attary, $addattary);
 668              spew("$me: Added attributes to this tag\n");
 669          }
 670      }
 671      return $attary;
 672  }
 673  
 674  /**
 675   * This is the main function and the one you should actually be calling.
 676   * There are several variables you should be aware of and which need
 677   * special description.
 678   *
 679   * $tag_list
 680   * ----------
 681   * This is a simple one-dimensional array of strings, except for the
 682   * very first one. The first member should be either false or true.
 683   * In case it's FALSE, the following list will be considered a list of
 684   * tags that should be explicitly REMOVED from the body, and all
 685   * others that did not match the list will be allowed.  If the first
 686   * member is TRUE, then the list is the list of tags that should be
 687   * explicitly ALLOWED -- any tag not matching this list will be
 688   * discarded.
 689   *
 690   * Examples:
 691   * $tag_list = Array(
 692   *                   false,   
 693   *                   "blink", 
 694   *                   "link",
 695   *             "object",
 696   *             "meta",
 697   *                   "marquee",
 698   *                   "html"
 699   *                    );
 700   *
 701   * This will allow all tags except for blink, link, object, meta, marquee, 
 702   * and html.
 703   *
 704   * $tag_list = Array(
 705   *                   true, 
 706   *                   "b", 
 707   *                   "a", 
 708   *                   "i", 
 709   *                   "img", 
 710   *                   "strong", 
 711   *                   "em", 
 712   *                   "p"
 713   *                  );
 714   *
 715   * This will remove all tags from the body except b, a, i, img, strong, em and
 716   * p.
 717   *
 718   * $rm_tags_with_content
 719   * ---------------------
 720   * This is a simple one-dimensional array of strings, which specifies the
 721   * tags to be removed with any and all content between the beginning and
 722   * the end of the tag.
 723   * Example:
 724   * $rm_tags_with_content = Array(
 725   *                               "script",
 726   *                               "style", 
 727   *                               "applet",
 728   *                               "embed"
 729   *                              );
 730   *
 731   * This will remove the following structure:
 732   * <script>
 733   *  window.alert("Isn't cross-site-scripting fun?!");
 734   * </script>
 735   * 
 736   * $self_closing_tags
 737   * ------------------
 738   * This is a simple one-dimensional array of strings, which specifies which
 739   * tags contain no content and should not be forcefully closed if this option
 740   * is turned on (see further).
 741   * Example:
 742   * $self_closing_tags =  Array(
 743   *                             "img",
 744   *                             "br", 
 745   *                             "hr",
 746   *                             "input"
 747   *                            );    
 748   *
 749   * $force_tag_closing
 750   * ------------------
 751   * Set it to true to forcefully close any tags opened within the document.
 752   * This is good if you want to take care of people who like to screw up
 753   * the pages by leaving unclosed tags like <a>, <b>, <i>, etc.
 754   *
 755   * $rm_attnames
 756   * -------------
 757   * Now we come to parameters that are more obscure. This parameter is
 758   * a nested array which is used to specify which attributes should be
 759   * removed. It goes like so:
 760   * 
 761   * $rm_attnames = Array(
 762   *   "PCRE regex to match tag name" =>
 763   *     Array(
 764   *           "PCRE regex to match attribute name"
 765   *           )
 766   *   );
 767   *
 768   * Example:
 769   * $rm_attnames = Array(
 770   *   "|.*|" =>
 771   *     Array(
 772   *           "|target|i",
 773   *           "|^on.*|i"  
 774   *          )
 775   *   );
 776   *
 777   * This will match all tags (.*), and specify that all attributes whose
 778   * name contains "target" or starts with "on" should be removed. This will take
 779   * care of the following problem:
 780   * <em onmouseover="window.alert('muahahahaha')">
 781   * The "onmouseover" will be removed.
 782   *
 783   * $bad_attvals
 784   * ------------
 785   * This is where it gets ugly. This is a nested array with many levels.
 786   * It goes like so:
 787   *
 788   * $bad_attvals = Array(
 789   *   "pcre regex to match tag name" =>
 790   *     Array(
 791   *           "pcre regex to match attribute name" =>
 792   *             Array(
 793   *                   "pcre regex to match attribute value"
 794   *                  )
 795   *             Array(
 796   *                   "pcre regex replace a match from above with"
 797   *                  )
 798   *          )
 799   *   );
 800   *
 801   * An extensive example:
 802   *
 803   * $bad_attvals = Array(
 804   *   "|.*|" =>
 805   *      Array(
 806   *            "/^src|background|href|action/i" =>
 807   *                Array(
 808   *                      Array(
 809   *                            "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si"
 810   *                            ),
 811   *                      Array(
 812   *                            "\\1http://veryfunny.com/\\2"
 813   *                            )
 814   *                      ),
 815   *            "/^style/i" =>
 816   *                Array(
 817   *                      Array(
 818   *                            "/expression/si",
 819   *                            "/url\(([\'\"])\s*https*:.*([\'\"])\)/si",
 820   *                            "/url\(([\'\"])\s*\S+script:.*([\'\"])\)/si"
 821   *                           ),
 822   *                      Array(
 823   *                            "idiocy",
 824   *                            "url(\\1http://veryfunny.com/\\2)",
 825   *                            "url(\\1http://veryfunny.com/\\2)"
 826   *                           )
 827   *                      )
 828   *            )
 829   *  );
 830   *
 831   * This will take care of nearly all known cross-site scripting exploits,
 832   * plus some (see my filter sample at 
 833   * http://www.mricon.com/html/phpfilter.html for a working version).
 834   *
 835   * $add_attr_to_tag
 836   * ----------------
 837   * This is a useful little feature which lets you add attributes to 
 838   * certain tags. It is a nested array as well, but not at all like
 839   * the previous one. It goes like so:
 840   * 
 841   * $add_attr_to_tag = Array(
 842   *   "PCRE regex to match tag name" =>
 843   *     Array(
 844   *           "attribute name"=>'"attribute value"'
 845   *          )
 846   *   );
 847   * 
 848   * Note: don't forget quotes around attribute value.
 849   * 
 850   * Example:
 851   * 
 852   * $add_attr_to_tag = Array(
 853   *   "/^a$/si" => 
 854   *     Array(
 855   *           'target'=>'"_new"'
 856   *          )
 857   *   );
 858   * 
 859   * This will change all <a> tags and add target="_new" to them so all links
 860   * open in a new window.
 861   *
 862   *
 863   *
 864   * @param $body                 the string with HTML you wish to filter
 865   * @param $tag_list             see description above
 866   * @param $rm_tags_with_content see description above
 867   * @param $self_closing_tags    see description above
 868   * @param $force_tag_closing    see description above
 869   * @param $rm_attnames          see description above
 870   * @param $bad_attvals          see description above
 871   * @param $add_attr_to_tag      see description above
 872   * @return                      sanitized html safe to show on your pages.
 873   */
 874  function sanitize($body, 
 875                    $tag_list = array(), 
 876                    $rm_tags_with_content = array(),
 877                    $self_closing_tags = array(),
 878                    $force_tag_closing = true,
 879                    $rm_attnames = array(),
 880                    $bad_attvals = array(),
 881                    $add_attr_to_tag = array()
 882                    ){
 883      $me = 'sanitize';
 884      /**
 885       * Normalize rm_tags and rm_tags_with_content.
 886       */
 887      @array_walk($tag_list, 'casenormalize');
 888      @array_walk($rm_tags_with_content, 'casenormalize');
 889      @array_walk($self_closing_tags, 'casenormalize');
 890      /**
 891       * See if tag_list is of tags to remove or tags to allow.
 892       * false  means remove these tags
 893       * true   means allow these tags
 894       */
 895      $rm_tags = array_shift($tag_list);
 896      $curpos = 0;
 897      $open_tags = Array();
 898      $trusted = "<!-- begin sanitized html -->\n";
 899      $skip_content = false;
 900      /**
 901       * Take care of netscape's stupid javascript entities like
 902       * &{alert('boo')};
 903       */
 904      $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
 905      spew("$me: invoking the loop\n");
 906      while (($curtag = getnxtag($body, $curpos)) != FALSE){
 907          list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
 908          spew("$me: grabbing free-standing content\n");
 909          $free_content = substr($body, $curpos, $lt - $curpos);
 910          spew("$me: " . strlen($free_content) . " chars grabbed\n");
 911          if ($skip_content == false){
 912              spew("$me: appending free content to trusted.\n");
 913              $trusted .= $free_content;
 914          } else {
 915              spew("$me: Skipping free content.\n");
 916          }
 917          if ($tagname != FALSE){
 918              spew("$me: tagname is '$tagname'\n");
 919              if ($tagtype == 2){
 920                  spew("$me: This is a closing tag\n");
 921                  if ($skip_content == $tagname){
 922                      /**
 923                       * Got to the end of tag we needed to remove.
 924                       */
 925                      spew("$me: Finished removing tag with content\n");
 926                      $tagname = false;
 927                      $skip_content = false;
 928                  } else {
 929                      if ($skip_content == false){
 930                          if (isset($open_tags{$tagname}) && 
 931                              $open_tags{$tagname} > 0){
 932                              spew("$me: popping '$tagname' from open_tags\n");
 933                              $open_tags{$tagname}--;
 934                          } else {
 935                              spew("$me: '$tagname' was never opened\n");
 936                              spew("$me: removing\n");
 937                              $tagname = false;
 938                          }
 939                      } else {
 940                          spew("$me: Skipping this tag\n");
 941                      }
 942                  }
 943              } else {
 944                  /**
 945                   * $rm_tags_with_content
 946                   */
 947                  if ($skip_content == false){
 948                      /**
 949                       * See if this is a self-closing type and change
 950                       * tagtype appropriately.
 951                       */
 952                      if ($tagtype == 1
 953                          && in_array($tagname, $self_closing_tags)){
 954                          spew("$me: Self-closing tag. Changing tagtype.\n");
 955                          $tagtype = 3;
 956                      }
 957                      /**
 958                       * See if we should skip this tag and any content
 959                       * inside it.
 960                       */
 961                      if ($tagtype == 1 
 962                          && in_array($tagname, $rm_tags_with_content)){
 963                          spew("$me: removing this tag with content\n");
 964                          $skip_content = $tagname;
 965                      } else {
 966                          if (($rm_tags == false 
 967                               && in_array($tagname, $tag_list)) ||
 968                              ($rm_tags == true 
 969                               && !in_array($tagname, $tag_list))){
 970                              spew("$me: Removing this tag.\n");
 971                              $tagname = false;
 972                          } else {
 973                              if ($tagtype == 1){
 974                                  spew("$me: adding '$tagname' to open_tags\n");
 975                                  if (isset($open_tags{$tagname})){
 976                                      $open_tags{$tagname}++;
 977                                  } else {
 978                                      $open_tags{$tagname} = 1;
 979                                  }
 980                              }
 981                              /**
 982                               * This is where we run other checks.
 983                               */
 984                              if (is_array($attary) && sizeof($attary) > 0){
 985                                  $attary = fixatts($tagname,
 986                                                    $attary,
 987                                                    $rm_attnames,
 988                                                    $bad_attvals,
 989                                                    $add_attr_to_tag);
 990                              }
 991                          }
 992                      }
 993                  } else {
 994                      spew("$me: Skipping this tag\n");
 995                  }
 996              }
 997              if ($tagname != false && $skip_content == false){
 998                  spew("$me: Appending tag to trusted.\n");
 999                  $trusted .= tagprint($tagname, $attary, $tagtype);
1000              }
1001          } else {
1002              spew("$me: Removing invalid tag\n");
1003          }
1004          $curpos = $gt + 1;
1005      }
1006      spew("$me: Appending any leftover content\n");
1007      $trusted .= substr($body, $curpos, strlen($body) - $curpos);
1008      if ($force_tag_closing == true){
1009          foreach ($open_tags as $tagname=>$opentimes){
1010              while ($opentimes > 0){
1011                  spew("$me: '$tagname' left open. Closing by force.\n");
1012                  $trusted .= '</' . $tagname . '>';
1013                  $opentimes--;
1014              }
1015          }
1016          $trusted .= "\n";
1017      }
1018      $trusted .= "<!-- end sanitized html -->\n";
1019      return $trusted;
1020  }
1021  ?>


Generated: Wed Dec 30 05:55:15 2009 Cross-referenced by PHPXref 0.7