[ Index ]

PHP Cross Reference of phpwcms V1.5.0 _r431 (28.01.12)

title

Body

[close]

/include/inc_ext/htmlfilter/ -> htmlfilter.php (source)

   1  <?php
   2  /**
   3   * htmlfilter.inc
   4   * ---------------
   5   * This set of functions allows you to filter html in order to remove
   6   * any malicious tags from it. Useful in cases when you need to filter
   7   * user input for any cross-site-scripting attempts.
   8   *
   9   * Copyright (C) 2002-2004 by Duke University
  10   *
  11   * This library is free software; you can redistribute it and/or
  12   * modify it under the terms of the GNU Lesser General Public
  13   * License as published by the Free Software Foundation; either
  14   * version 2.1 of the License, or (at your option) any later version.
  15   *
  16   * This library is distributed in the hope that it will be useful,
  17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19   * Lesser General Public License for more details.
  20   *
  21   * You should have received a copy of the GNU Lesser General Public
  22   * License along with this library; if not, write to the Free Software
  23   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  
  24   * 02110-1301  USA
  25   *
  26   * @Author  Konstantin Riabitsev <icon@linux.duke.edu>
  27   * @Version 1.1 ($Date: 2005/06/30 18:06:08 $)
  28   */
  29  
  30  /**
  31   * This is a debugging function used throughout the code. To enable
  32   * debugging you have to specify a global variable called "debug" before
  33   * calling sanitize() and set it to true. 
  34   *
  35   * Note: Although insignificantly, debugging does slow you down even
  36   * when $debug is set to false. If you wish to get rid of all
  37   * debugging calls, run the following command:
  38   *
  39   * fgrep -v '//htmlfilter_debug("' htmlfilter.inc > htmlfilter.inc.new
  40   *
  41   * htmlfilter.inc.new will contain no debugging calls.
  42   *
  43   * @param  $message  A string with the message to output.
  44   * @return           void.
  45   */
  46  function htmlfilter_debug($message){
  47      return NULL;
  48      /*
  49      global $debug;
  50      if ($debug == true){
  51          echo "$message";
  52      }
  53      */
  54  }
  55  
  56  /**
  57   * This function returns the final tag out of the tag name, an array
  58   * of attributes, and the type of the tag. This function is called by 
  59   * sanitize internally.
  60   *
  61   * @param  $tagname  the name of the tag.
  62   * @param  $attary   the array of attributes and their values
  63   * @param  $tagtype  The type of the tag (see in comments).
  64   * @return           a string with the final tag representation.
  65   */
  66  function tagprint($tagname, $attary, $tagtype){
  67      $me = 'tagprint';
  68      if ($tagtype == 2){
  69          $fulltag = '</' . $tagname . '>';
  70      } else {
  71          $fulltag = '<' . $tagname;
  72          if (is_array($attary) && sizeof($attary)){
  73              $atts = array();
  74              while (list($attname, $attvalue) = each($attary)){
  75                  array_push($atts, "$attname=$attvalue");
  76              }
  77              $fulltag .= ' ' . join(' ', $atts);
  78          }
  79          if ($tagtype == 3){
  80              $fulltag .= ' /';
  81          }
  82          $fulltag .= '>';
  83      }
  84      //htmlfilter_debug("$me: $fulltag\n");
  85      return $fulltag;
  86  }
  87  
  88  /**
  89   * A small helper function to use with array_walk. Modifies a by-ref
  90   * value and makes it lowercase.
  91   *
  92   * @param  $val a value passed by-ref.
  93   * @return      void since it modifies a by-ref value.
  94   */
  95  function casenormalize(&$val){
  96      $val = strtolower($val);
  97  }
  98  
  99  /**
 100   * This function skips any whitespace from the current position within
 101   * a string and to the next non-whitespace value.
 102   * 
 103   * @param  $body   the string
 104   * @param  $offset the offset within the string where we should start
 105   *                 looking for the next non-whitespace character.
 106   * @return         the location within the $body where the next
 107   *                 non-whitespace char is located.
 108   */
 109  function skipspace($body, $offset){
 110      $me = 'skipspace';
 111      preg_match('/^(\s*)/s', substr($body, $offset), $matches);
 112      if (sizeof($matches{1})){
 113          $count = strlen($matches{1});
 114          //htmlfilter_debug("$me: skipped $count chars\n");
 115          $offset += $count;
 116      }
 117      return $offset;
 118  }
 119  
 120  /**
 121   * This function looks for the next character within a string.  It's
 122   * really just a glorified "strpos", except it catches the failures
 123   * nicely.
 124   *
 125   * @param  $body   The string to look for needle in.
 126   * @param  $offset Start looking from this position.
 127   * @param  $needle The character/string to look for.
 128   * @return         location of the next occurance of the needle, or
 129   *                 strlen($body) if needle wasn't found.
 130   */
 131  function findnxstr($body, $offset, $needle){
 132      $me = 'findnxstr';
 133      $pos = strpos($body, $needle, $offset);
 134      if ($pos === FALSE){
 135          $pos = strlen($body);
 136          //htmlfilter_debug("$me: end of body reached\n");
 137      }
 138      //htmlfilter_debug("$me: '$needle' found at pos $pos\n");
 139      return $pos;
 140  }
 141  
 142  /**
 143   * This function takes a PCRE-style regexp and tries to match it
 144   * within the string.
 145   *
 146   * @param  $body   The string to look for needle in.
 147   * @param  $offset Start looking from here.
 148   * @param  $reg    A PCRE-style regex to match.
 149   * @return         Returns a false if no matches found, or an array
 150   *                 with the following members:
 151   *                 - integer with the location of the match within $body
 152   *                 - string with whatever content between offset and the match
 153   *                 - string with whatever it is we matched
 154   */
 155  function findnxreg($body, $offset, $reg){
 156      $me = 'findnxreg';
 157      $matches = array();
 158      $retarr = array();
 159      $preg_rule = '%^(.*?)(' . $reg . ')%s';
 160      preg_match($preg_rule, substr($body, $offset), $matches);
 161      if (!isset($matches{0})){
 162          //htmlfilter_debug("$me: No matches found.\n");
 163          $retarr = false;
 164      } else {
 165          $retarr{0} = $offset + strlen($matches{1});
 166          $retarr{1} = $matches{1};
 167          $retarr{2} = $matches{2};
 168          //htmlfilter_debug("$me: '$reg' found at pos $offset matching '".$matches{2}."'\n");
 169      }
 170      return $retarr;
 171  }
 172  
 173  /**
 174   * This function looks for the next tag.
 175   *
 176   * @param  $body   String where to look for the next tag.
 177   * @param  $offset Start looking from here.
 178   * @return         false if no more tags exist in the body, or
 179   *                 an array with the following members:
 180   *                 - string with the name of the tag
 181   *                 - array with attributes and their values
 182   *                 - integer with tag type (1, 2, or 3)
 183   *                 - integer where the tag starts (starting "<")
 184   *                 - integer where the tag ends (ending ">")
 185   *                 first three members will be false, if the tag is invalid.
 186   */
 187  function getnxtag($body, $offset){
 188      $me = 'getnxtag';
 189      if ($offset > strlen($body)){
 190          //htmlfilter_debug("$me: Past the end of body\n");
 191          return false;
 192      }
 193      $lt = findnxstr($body, $offset, '<');
 194      if ($lt == strlen($body)){
 195          //htmlfilter_debug("$me: No more tags found!\n");
 196          return false;
 197      }
 198      /**
 199       * We are here:
 200       * blah blah <tag attribute="value">
 201       * \---------^
 202       */
 203      //htmlfilter_debug("$me: Found '<' at pos $lt\n");
 204      $pos = skipspace($body, $lt + 1);
 205      if ($pos >= strlen($body)){
 206          //htmlfilter_debug("$me: End of body reached.\n");
 207          return array(false, false, false, $lt, strlen($body));
 208      }
 209      /**
 210       * There are 3 kinds of tags:
 211       * 1. Opening tag, e.g.:
 212       *    <a href="blah">
 213       * 2. Closing tag, e.g.:
 214       *    </a>
 215       * 3. XHTML-style content-less tag, e.g.:
 216       *    <img src="blah"/>
 217       */
 218      $tagtype = false;
 219      switch (substr($body, $pos, 1)){
 220      case '/':
 221          //htmlfilter_debug("$me: This is a closing tag (type 2)\n");
 222          $tagtype = 2;
 223          $pos++;
 224          break;
 225      case '!':
 226          /**
 227           * A comment or an SGML declaration.
 228           */
 229          if (substr($body, $pos+1, 2) == '--'){
 230              //htmlfilter_debug("$me: A comment found. Stripping.\n");
 231              $gt = strpos($body, '-->', $pos);
 232              if ($gt === false){
 233                  $gt = strlen($body);
 234              } else {
 235                  $gt += 2;
 236              }
 237              return array(false, false, false, $lt, $gt);
 238          } else {
 239              //htmlfilter_debug("$me: An SGML declaration found. Stripping.\n");
 240              $gt = findnxstr($body, $pos, '>');
 241              return array(false, false, false, $lt, $gt);
 242          }
 243          break;
 244      default:
 245          /**
 246           * Assume tagtype 1 for now. If it's type 3, we'll switch values
 247           * later.
 248           */
 249          $tagtype = 1;
 250          break;
 251      }
 252      
 253      $tag_start = $pos;
 254      $tagname = '';
 255      /**
 256       * Look for next [\W-_], which will indicate the end of the tag name.
 257       */
 258      $regary = findnxreg($body, $pos, '[^\w\-_]');
 259      if ($regary == false){
 260          //htmlfilter_debug("$me: End of body reached while analyzing tag name\n");
 261          return array(false, false, false, $lt, strlen($body));
 262      }
 263      list($pos, $tagname, $match) = $regary;
 264      $tagname = strtolower($tagname);
 265      
 266      /**
 267       * $match can be either of these:
 268       * '>'  indicating the end of the tag entirely.
 269       * '\s' indicating the end of the tag name.
 270       * '/'  indicating that this is type-3 xhtml tag.
 271       * 
 272       * Whatever else we find there indicates an invalid tag.
 273       */
 274      switch ($match){
 275      case '/':
 276          /**
 277           * This is an xhtml-style tag with a closing / at the
 278           * end, like so: <img src="blah"/>. Check if it's followed
 279           * by the closing bracket. If not, then this tag is invalid
 280           */
 281          if (substr($body, $pos, 2) == '/>'){
 282              //htmlfilter_debug("$me: XHTML-style tag found.\n");
 283              $pos++;
 284              //htmlfilter_debug("$me: Setting tagtype to 3\n");
 285              $tagtype = 3;
 286          } else {
 287              //htmlfilter_debug("$me: Found invalid character '/'.\n");
 288              $gt = findnxstr($body, $pos, '>');
 289              //htmlfilter_debug("$me: Tag is invalid. Returning.\n");
 290              $retary = array(false, false, false, $lt, $gt);
 291              return $retary;
 292          }
 293      case '>':
 294          //htmlfilter_debug("$me: End of tag found at $pos\n");
 295          //htmlfilter_debug("$me: Tagname is '$tagname'\n");
 296          //htmlfilter_debug("$me: This tag has no attributes\n");
 297          return array($tagname, false, $tagtype, $lt, $pos);
 298          break;
 299      default:
 300          /**
 301           * Check if it's whitespace
 302           */
 303          if (preg_match('/\s/', $match)){
 304              //htmlfilter_debug("$me: Tagname is '$tagname'\n");
 305          } else {
 306              /**
 307               * This is an invalid tag! Look for the next closing ">".
 308               */
 309              //htmlfilter_debug("$me: Invalid characters found in tag name: $match\n");
 310              $gt = findnxstr($body, $lt, '>');
 311              return array(false, false, false, $lt, $gt);
 312          }
 313      }
 314      
 315      /**
 316       * At this point we're here:
 317       * <tagname  attribute='blah'>
 318       * \-------^
 319       *
 320       * At this point we loop in order to find all attributes.
 321       */
 322      $attname = '';
 323      $atttype = false;
 324      $attary = array();
 325      
 326      while ($pos <= strlen($body)){
 327          $pos = skipspace($body, $pos);
 328          if ($pos == strlen($body)){
 329              /**
 330               * Non-closed tag.
 331               */
 332              //htmlfilter_debug("$me: End of body reached before end of tag. Discarding.\n");
 333              return array(false, false, false, $lt, $pos);
 334          }
 335          /**
 336           * See if we arrived at a ">" or "/>", which means that we reached
 337           * the end of the tag.
 338           */
 339          $matches = array();
 340          preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches);
 341          if (isset($matches{0}) && $matches{0}){
 342              /**
 343               * Yep. So we did.
 344               */
 345              //htmlfilter_debug("$me: Arrived at the end of the tag.\n");
 346              $pos += strlen($matches{1});
 347              if ($matches{2} == '/>'){
 348                  $tagtype = 3;
 349                  $pos++;
 350              }
 351              return array($tagname, $attary, $tagtype, $lt, $pos);
 352          }
 353          
 354          /**
 355           * There are several types of attributes, with optional
 356           * [:space:] between members.
 357           * Type 1:
 358           *   attrname[:space:]=[:space:]'CDATA'
 359           * Type 2:
 360           *   attrname[:space:]=[:space:]"CDATA"
 361           * Type 3:
 362           *   attr[:space:]=[:space:]CDATA
 363           * Type 4:
 364           *   attrname
 365           *
 366           * We leave types 1 and 2 the same, type 3 we check for
 367           * '"' and convert to "&quot" if needed, then wrap in
 368           * double quotes. Type 4 we convert into:
 369           * attrname="yes".
 370           */
 371          $regary = findnxreg($body, $pos, '[^\w\-_]');
 372          if ($regary == false){
 373              /**
 374               * Looks like body ended before the end of tag.
 375               */
 376              //htmlfilter_debug("$me: End of body found before end of tag.\n");
 377              //htmlfilter_debug("$me: Invalid, returning\n");
 378              return array(false, false, false, $lt, strlen($body));
 379          }
 380          list($pos, $attname, $match) = $regary;
 381          $attname = strtolower($attname);
 382          //htmlfilter_debug("$me: Attribute '$attname' found\n");
 383          /**
 384           * We arrived at the end of attribute name. Several things possible
 385           * here:
 386           * '>'  means the end of the tag and this is attribute type 4
 387           * '/'  if followed by '>' means the same thing as above
 388           * '\s' means a lot of things -- look what it's followed by.
 389           *      anything else means the attribute is invalid.
 390           */
 391          switch($match){
 392          case '/':
 393              /**
 394               * This is an xhtml-style tag with a closing / at the
 395               * end, like so: <img src="blah"/>. Check if it's followed
 396               * by the closing bracket. If not, then this tag is invalid
 397               */
 398              if (substr($body, $pos, 2) == '/>'){
 399                  //htmlfilter_debug("$me: This is an xhtml-style tag.\n");
 400                  $pos++;
 401                  //htmlfilter_debug("$me: Setting tagtype to 3\n");
 402                  $tagtype = 3;
 403              } else {
 404                  //htmlfilter_debug("$me: Found invalid character '/'.\n");
 405                  $gt = findnxstr($body, $pos, '>');
 406                  //htmlfilter_debug("$me: Tag is invalid. Returning.\n");
 407                  $retary = array(false, false, false, $lt, $gt);
 408                  return $retary;
 409              }
 410          case '>':
 411              //htmlfilter_debug("$me: found type 4 attribute.\n");
 412              //htmlfilter_debug("$me: Additionally, end of tag found at $pos\n");
 413              //htmlfilter_debug("$me: Attname is '$attname'\n");
 414              //htmlfilter_debug("$me: Setting attvalue to 'yes'\n");
 415              $attary{$attname} = '"yes"';
 416              return array($tagname, $attary, $tagtype, $lt, $pos);
 417              break;
 418          default:
 419              /**
 420               * Skip whitespace and see what we arrive at.
 421               */
 422              $pos = skipspace($body, $pos);
 423              $char = substr($body, $pos, 1);
 424              /**
 425               * Two things are valid here:
 426               * '=' means this is attribute type 1 2 or 3.
 427               * \w means this was attribute type 4.
 428               * anything else we ignore and re-loop. End of tag and
 429               * invalid stuff will be caught by our checks at the beginning
 430               * of the loop.
 431               */
 432              if ($char == '='){
 433                  //htmlfilter_debug("$me: Attribute type 1, 2, or 3 found.\n");
 434                  $pos++;
 435                  $pos = skipspace($body, $pos);
 436                  /**
 437                   * Here are 3 possibilities:
 438                   * "'"  attribute type 1
 439                   * '"'  attribute type 2
 440                   * everything else is the content of tag type 3
 441                   */
 442                  $quot = substr($body, $pos, 1);
 443                  if ($quot == '\''){
 444                      //htmlfilter_debug("$me: In fact, this is attribute type 1\n");
 445                      //htmlfilter_debug("$me: looking for closing quote\n");
 446                      $regary = findnxreg($body, $pos+1, '\'');
 447                      if ($regary == false){
 448                          //htmlfilter_debug("$me: end of body reached before end of val\n");
 449                          //htmlfilter_debug("$me: Returning\n");
 450                          return array(false, false, false, $lt, strlen($body));
 451                      }
 452                      list($pos, $attval, $match) = $regary;
 453                      //htmlfilter_debug("$me: Attvalue is '$attval'\n");
 454                      $pos++;
 455                      $attary{$attname} = '\'' . $attval . '\'';
 456                  } else if ($quot == '"'){
 457                      //htmlfilter_debug("$me: In fact, this is attribute type 2\n");
 458                      //htmlfilter_debug("$me: looking for closing quote\n");
 459                      $regary = findnxreg($body, $pos+1, '\"');
 460                      if ($regary == false){
 461                          //htmlfilter_debug("$me: end of body reached before end of val\n");
 462                          //htmlfilter_debug("$me: Returning\n");
 463                          return array(false, false, false, $lt, strlen($body));
 464                      }
 465                      list($pos, $attval, $match) = $regary;
 466                      //htmlfilter_debug("$me: Attvalue is \"$attval\"\n");
 467                      $pos++;
 468                      $attary{$attname} = '"' . $attval . '"';
 469                  } else {
 470                      //htmlfilter_debug("$me: This looks like attribute type 3\n");
 471                      /**
 472                       * These are hateful. Look for \s, or >.
 473                       */
 474                      //htmlfilter_debug("$me: Looking for end of attval\n");
 475                      $regary = findnxreg($body, $pos, '[\s>]');
 476                      if ($regary == false){
 477                          //htmlfilter_debug("$me: end of body reached before end of val\n");
 478                          //htmlfilter_debug("$me: Returning\n");
 479                          return array(false, false, false, $lt, strlen($body));
 480                      }
 481                      list($pos, $attval, $match) = $regary;
 482                      /**
 483                       * If it's ">" it will be caught at the top.
 484                       */
 485                      //htmlfilter_debug("$me: translating '\"' into &quot;\n");
 486                      $attval = preg_replace('/\"/s', '&quot;', $attval);
 487                      //htmlfilter_debug("$me: wrapping in quotes\n");
 488                      $attary{$attname} = '"' . $attval . '"';
 489                  }
 490              } else if (preg_match('|[\w/>]|', $char)) {
 491                  /**
 492                   * That was attribute type 4.
 493                   */
 494                  //htmlfilter_debug("$me: attribute type 4 found.\n");
 495                  //htmlfilter_debug("$me: Setting value to 'yes'\n");
 496                  $attary{$attname} = '"yes"';
 497              } else {
 498                  /**
 499                   * An illegal character. Find next '>' and return.
 500                   */
 501                  //htmlfilter_debug("$me: illegal character '$char' found.\n");
 502                  //htmlfilter_debug("$me: returning\n");
 503                  $gt = findnxstr($body, $pos, '>');
 504                  return array(false, false, false, $lt, $gt);
 505              }
 506          }
 507      }
 508      /**
 509       * The fact that we got here indicates that the tag end was never
 510       * found. Return invalid tag indication so it gets stripped.
 511       */
 512      //htmlfilter_debug("$me: No tag end found\n");
 513      return array(false, false, false, $lt, strlen($body));
 514  }
 515  
 516  /**
 517   * Translates entities into literal values so they can be checked.
 518   *
 519   * @param $attvalue the by-ref value to check.
 520   * @param $regex    the regular expression to check against.
 521   * @param $hex      whether the entites are hexadecimal.
 522   * @return          True or False depending on whether there were matches.
 523   */
 524  function deent(&$attvalue, $regex, $hex=false){
 525      $me = 'deent';
 526      //htmlfilter_debug("$me: matching '$regex' against: $attvalue\n");
 527      $ret_match = false;
 528      preg_match_all($regex, $attvalue, $matches);
 529      if (is_array($matches) && sizeof($matches[0]) > 0){
 530          //htmlfilter_debug("$me: found " . sizeof($matches[0]) . " matches\n");
 531          $repl = array();
 532          for ($i = 0; $i < sizeof($matches[0]); $i++){
 533              $numval = $matches[1][$i];
 534              //htmlfilter_debug("$me: numval is $numval\n");
 535              if ($hex){
 536                  $numval = hexdec($numval);
 537                  //htmlfilter_debug("$me: hex! Numval is now $numval\n");
 538              }
 539              $repl{$matches[0][$i]} = chr($numval);
 540          }
 541          $attvalue = strtr($attvalue, $repl);
 542          //htmlfilter_debug("$me: attvalue after translation: $attvalue\n");
 543          return true;
 544      } else {
 545          //htmlfilter_debug("$me: no matches! Returning false.\n");
 546          return false;
 547      }
 548  }
 549  
 550  /**
 551   * This function checks attribute values for entity-encoded values
 552   * and returns them translated into 8-bit strings so we can run
 553   * checks on them.
 554   *
 555   * @param  $attvalue A string to run entity check against.
 556   * @return           Nothing, modifies a reference value.
 557   */
 558  function defang(&$attvalue){
 559      $me = 'defang';
 560      /**
 561       * Skip this if there aren't ampersands or backslashes.
 562       */
 563      //htmlfilter_debug("$me: Checking '$attvalue' for suspicious content\n");
 564      if (strpos($attvalue, '&') === false
 565          && strpos($attvalue, '\\') === false){
 566          //htmlfilter_debug("$me: no suspicious content found, returning.\n");
 567          return;
 568      }
 569      $m = false;
 570      do {
 571          $m = false;
 572          $m = $m || deent($attvalue, '/\&#0*(\d+);*/s');
 573          $m = $m || deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true);
 574          $m = $m || deent($attvalue, '/\\\\(\d+)/s', true);
 575          //htmlfilter_debug("$me: m=$m\n");
 576      } while ($m == true);
 577      $attvalue = stripslashes($attvalue);
 578      //htmlfilter_debug("$me: translated into: $attvalue\n");
 579  }
 580  
 581  /**
 582   * Kill any tabs, newlines, or carriage returns. Our friends the
 583   * makers of the browser with 95% market value decided that it'd
 584   * be funny to make "java[tab]script" be just as good as "javascript".
 585   * 
 586   * @param  attvalue  The attribute value before extraneous spaces removed.
 587   * @return attvalue  Nothing, modifies a reference value.
 588   */
 589  function unspace(&$attvalue){
 590      $me = 'unspace';
 591      if (strcspn($attvalue, "\t\r\n\0 ") != strlen($attvalue)){
 592          //htmlfilter_debug("$me: Killing whitespace.\n");
 593          $attvalue = str_replace(array("\t", "\r", "\n", "\0", " "), 
 594                                  array('',   '',   '',   '',   ''), $attvalue);
 595      }
 596      //htmlfilter_debug("$me: after unspace: $attvalue\n");
 597  }
 598  
 599  /**
 600   * This function runs various checks against the attributes.
 601   *
 602   * @param  $tagname         String with the name of the tag.
 603   * @param  $attary          Array with all tag attributes.
 604   * @param  $rm_attnames     See description for sanitize
 605   * @param  $bad_attvals     See description for sanitize
 606   * @param  $add_attr_to_tag See description for sanitize
 607   * @return                  Array with modified attributes.
 608   */
 609  function fixatts($tagname, 
 610                   $attary, 
 611                   $rm_attnames,
 612                   $bad_attvals,
 613                   $add_attr_to_tag
 614                   ){
 615      $me = 'fixatts';
 616      //htmlfilter_debug("$me: Fixing attributes\n");
 617      while (list($attname, $attvalue) = each($attary)){
 618          /**
 619           * See if this attribute should be removed.
 620           */
 621          foreach ($rm_attnames as $matchtag=>$matchattrs){
 622              if (preg_match($matchtag, $tagname)){
 623                  foreach ($matchattrs as $matchattr){
 624                      if (preg_match($matchattr, $attname)){
 625                          //htmlfilter_debug("$me: Attribute '$attname' defined as bad.\n");
 626                          //htmlfilter_debug("$me: Removing.\n");
 627                          unset($attary{$attname});
 628                          continue;
 629                      }
 630                  }
 631              }
 632          }
 633          /**
 634           * Remove any backslashes, entities, or extraneous whitespace.
 635           */
 636          defang($attvalue);
 637          unspace($attvalue);
 638          
 639          /**
 640           * Now let's run checks on the attvalues.
 641           * I don't expect anyone to comprehend this. If you do,
 642           * get in touch with me so I can drive to where you live and
 643           * shake your hand personally. :)
 644           */
 645          foreach ($bad_attvals as $matchtag=>$matchattrs){
 646              if (preg_match($matchtag, $tagname)){
 647                  foreach ($matchattrs as $matchattr=>$valary){
 648                      if (preg_match($matchattr, $attname)){
 649                          /**
 650                           * There are two arrays in valary.
 651                           * First is matches.
 652                           * Second one is replacements
 653                           */
 654                          list($valmatch, $valrepl) = $valary;
 655                          $newvalue = preg_replace($valmatch,$valrepl,$attvalue);
 656                          if ($newvalue != $attvalue){
 657                              //htmlfilter_debug("$me: attvalue is now $newvalue\n");
 658                              $attary{$attname} = $newvalue;
 659                          }
 660                      }
 661                  }
 662              }
 663          }
 664      }
 665      /**
 666       * See if we need to append any attributes to this tag.
 667       */
 668      foreach ($add_attr_to_tag as $matchtag=>$addattary){
 669          if (preg_match($matchtag, $tagname)){
 670              $attary = array_merge($attary, $addattary);
 671              //htmlfilter_debug("$me: Added attributes to this tag\n");
 672          }
 673      }
 674      return $attary;
 675  }
 676  
 677  /**
 678   * This is the main function and the one you should actually be calling.
 679   * There are several variables you should be aware of and which need
 680   * special description.
 681   *
 682   * $tag_list
 683   * ----------
 684   * This is a simple one-dimensional array of strings, except for the
 685   * very first one. The first member should be either false or true.
 686   * In case it's FALSE, the following list will be considered a list of
 687   * tags that should be explicitly REMOVED from the body, and all
 688   * others that did not match the list will be allowed.  If the first
 689   * member is TRUE, then the list is the list of tags that should be
 690   * explicitly ALLOWED -- any tag not matching this list will be
 691   * discarded.
 692   *
 693   * Examples:
 694   * $tag_list = array(
 695   *                   false,   
 696   *                   "blink", 
 697   *                   "link",
 698   *             "object",
 699   *             "meta",
 700   *                   "marquee",
 701   *                   "html"
 702   *                    );
 703   *
 704   * This will allow all tags except for blink, link, object, meta, marquee, 
 705   * and html.
 706   *
 707   * $tag_list = array(
 708   *                   true, 
 709   *                   "b", 
 710   *                   "a", 
 711   *                   "i", 
 712   *                   "img", 
 713   *                   "strong", 
 714   *                   "em", 
 715   *                   "p"
 716   *                  );
 717   *
 718   * This will remove all tags from the body except b, a, i, img, strong, em and
 719   * p.
 720   *
 721   * $rm_tags_with_content
 722   * ---------------------
 723   * This is a simple one-dimensional array of strings, which specifies the
 724   * tags to be removed with any and all content between the beginning and
 725   * the end of the tag.
 726   * Example:
 727   * $rm_tags_with_content = array(
 728   *                               "script",
 729   *                               "style", 
 730   *                               "applet",
 731   *                               "embed"
 732   *                              );
 733   *
 734   * This will remove the following structure:
 735   * <script>
 736   *  window.alert("Isn't cross-site-scripting fun?!");
 737   * </script>
 738   * 
 739   * $self_closing_tags
 740   * ------------------
 741   * This is a simple one-dimensional array of strings, which specifies which
 742   * tags contain no content and should not be forcefully closed if this option
 743   * is turned on (see further).
 744   * Example:
 745   * $self_closing_tags =  array(
 746   *                             "img",
 747   *                             "br", 
 748   *                             "hr",
 749   *                             "input"
 750   *                            );    
 751   *
 752   * $force_tag_closing
 753   * ------------------
 754   * Set it to true to forcefully close any tags opened within the document.
 755   * This is good if you want to take care of people who like to screw up
 756   * the pages by leaving unclosed tags like <a>, <b>, <i>, etc.
 757   *
 758   * $rm_attnames
 759   * -------------
 760   * Now we come to parameters that are more obscure. This parameter is
 761   * a nested array which is used to specify which attributes should be
 762   * removed. It goes like so:
 763   * 
 764   * $rm_attnames = array(
 765   *   "PCRE regex to match tag name" =>
 766   *     array(
 767   *           "PCRE regex to match attribute name"
 768   *           )
 769   *   );
 770   *
 771   * Example:
 772   * $rm_attnames = array(
 773   *   "|.*|" =>
 774   *     array(
 775   *           "|target|i",
 776   *           "|^on.*|i"  
 777   *          )
 778   *   );
 779   *
 780   * This will match all tags (.*), and specify that all attributes
 781   * named "target" and starting with "on" should be removed. This will take
 782   * care of the following problem:
 783   * <em onmouseover="window.alert('muahahahaha')">
 784   * The "onmouseover" will be removed.
 785   *
 786   * $bad_attvals
 787   * ------------
 788   * This is where it gets ugly. This is a nested array with many levels.
 789   * It goes like so:
 790   *
 791   * $bad_attvals = array(
 792   *   "pcre regex to match tag name" =>
 793   *     array(
 794   *           "pcre regex to match attribute name" =>
 795   *             array(
 796   *                   "pcre regex to match attribute value"
 797   *                  )
 798   *             array(
 799   *                   "pcre regex replace a match from above with"
 800   *                  )
 801   *          )
 802   *   );
 803   *
 804   * An extensive example:
 805   *
 806   * $bad_attvals = array(
 807   *   "|.*|" =>
 808   *      array(
 809   *            "/^src|background|href|action/i" =>
 810   *                array(
 811   *                      array(
 812   *                            "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si"
 813   *                            ),
 814   *                      array(
 815   *                            "\\1http://veryfunny.com/\\2"
 816   *                            )
 817   *                      ),
 818   *            "/^style/i" =>
 819   *                array(
 820   *                      array(
 821   *                            "/expression/si",
 822   *                            "/url\(([\'\"])\s*https*:.*([\'\"])\)/si",
 823   *                            "/url\(([\'\"])\s*\S+script:.*([\'\"])\)/si"
 824   *                           ),
 825   *                      array(
 826   *                            "idiocy",
 827   *                            "url(\\1http://veryfunny.com/\\2)",
 828   *                            "url(\\1http://veryfunny.com/\\2)"
 829   *                           )
 830   *                      )
 831   *            )
 832   *  );
 833   *
 834   * This will take care of nearly all known cross-site scripting exploits,
 835   * plus some (see my filter sample at 
 836   * http://www.mricon.com/html/phpfilter.html for a working version).
 837   *
 838   * $add_attr_to_tag
 839   * ----------------
 840   * This is a useful little feature which lets you add attributes to 
 841   * certain tags. It is a nested array as well, but not at all like
 842   * the previous one. It goes like so:
 843   * 
 844   * $add_attr_to_tag = array(
 845   *   "PCRE regex to match tag name" =>
 846   *     array(
 847   *           "attribute name"=>'"attribute value"'
 848   *          )
 849   *   );
 850   * 
 851   * Note: don't forget quotes around attribute value.
 852   * 
 853   * Example:
 854   * 
 855   * $add_attr_to_tag = array(
 856   *   "/^a$/si" => 
 857   *     array(
 858   *           'target'=>'"_new"'
 859   *          )
 860   *   );
 861   * 
 862   * This will change all <a> tags and add target="_new" to them so all links
 863   * open in a new window.
 864   *
 865   *
 866   *
 867   * @param $body                 the string with HTML you wish to filter
 868   * @param $tag_list             see description above
 869   * @param $rm_tags_with_content see description above
 870   * @param $self_closing_tags    see description above
 871   * @param $force_tag_closing    see description above
 872   * @param $rm_attnames          see description above
 873   * @param $bad_attvals          see description above
 874   * @param $add_attr_to_tag      see description above
 875   * @return                      sanitized html safe to show on your pages.
 876   */
 877  function htmlfilter_sanitize($body, 
 878                    $tag_list = array(), 
 879                    $rm_tags_with_content = array(),
 880                    $self_closing_tags = array(),
 881                    $force_tag_closing = true,
 882                    $rm_attnames = array(),
 883                    $bad_attvals = array(),
 884                    $add_attr_to_tag = array()
 885                    ){
 886      $me = 'sanitize';
 887      /**
 888       * Normalize rm_tags and rm_tags_with_content.
 889       */
 890      @array_walk($tag_list, 'casenormalize');
 891      @array_walk($rm_tags_with_content, 'casenormalize');
 892      @array_walk($self_closing_tags, 'casenormalize');
 893      /**
 894       * See if tag_list is of tags to remove or tags to allow.
 895       * false  means remove these tags
 896       * true   means allow these tags
 897       */
 898      $rm_tags = array_shift($tag_list);
 899      $curpos = 0;
 900      $open_tags = array();
 901      $trusted = ''; //"<!-- begin sanitized html -->\n";
 902      $skip_content = false;
 903      /**
 904       * Take care of netscape's stupid javascript entities like
 905       * &{alert('boo')};
 906       */
 907      $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', trim($body));
 908      //htmlfilter_debug("$me: invoking the loop\n");
 909      while (($curtag = getnxtag($body, $curpos)) != FALSE){
 910          list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
 911          //htmlfilter_debug("$me: grabbing free-standing content\n");
 912          $free_content = substr($body, $curpos, $lt - $curpos);
 913          //htmlfilter_debug("$me: " . strlen($free_content) . " chars grabbed\n");
 914          if ($skip_content == false){
 915              //htmlfilter_debug("$me: appending free content to trusted.\n");
 916              $trusted .= $free_content;
 917          } else {
 918              //htmlfilter_debug("$me: Skipping free content.\n");
 919          }
 920          if ($tagname != FALSE){
 921              //htmlfilter_debug("$me: tagname is '$tagname'\n");
 922              if ($tagtype == 2){
 923                  //htmlfilter_debug("$me: This is a closing tag\n");
 924                  if ($skip_content == $tagname){
 925                      /**
 926                       * Got to the end of tag we needed to remove.
 927                       */
 928                      //htmlfilter_debug("$me: Finished removing tag with content\n");
 929                      $tagname = false;
 930                      $skip_content = false;
 931                  } else {
 932                      if ($skip_content == false){
 933                          if (isset($open_tags{$tagname}) && 
 934                              $open_tags{$tagname} > 0){
 935                              //htmlfilter_debug("$me: popping '$tagname' from open_tags\n");
 936                              $open_tags{$tagname}--;
 937                          } else {
 938                              //htmlfilter_debug("$me: '$tagname' was never opened\n");
 939                              //htmlfilter_debug("$me: removing\n");
 940                              $tagname = false;
 941                          }
 942                      } else {
 943                          //htmlfilter_debug("$me: Skipping this tag\n");
 944                      }
 945                  }
 946              } else {
 947                  /**
 948                   * $rm_tags_with_content
 949                   */
 950                  if ($skip_content == false){
 951                      /**
 952                       * See if this is a self-closing type and change
 953                       * tagtype appropriately.
 954                       */
 955                      if ($tagtype == 1
 956                          && in_array($tagname, $self_closing_tags)){
 957                          //htmlfilter_debug("$me: Self-closing tag. Changing tagtype.\n");
 958                          $tagtype = 3;
 959                      }
 960                      /**
 961                       * See if we should skip this tag and any content
 962                       * inside it.
 963                       */
 964                      if ($tagtype == 1 
 965                          && in_array($tagname, $rm_tags_with_content)){
 966                          //htmlfilter_debug("$me: removing this tag with content\n");
 967                          $skip_content = $tagname;
 968                      } else {
 969                          if (($rm_tags == false 
 970                               && in_array($tagname, $tag_list)) ||
 971                              ($rm_tags == true 
 972                               && !in_array($tagname, $tag_list))){
 973                              //htmlfilter_debug("$me: Removing this tag.\n");
 974                              $tagname = false;
 975                          } else {
 976                              if ($tagtype == 1){
 977                                  //htmlfilter_debug("$me: adding '$tagname' to open_tags\n");
 978                                  if (isset($open_tags{$tagname})){
 979                                      $open_tags{$tagname}++;
 980                                  } else {
 981                                      $open_tags{$tagname} = 1;
 982                                  }
 983                              }
 984                              /**
 985                               * This is where we run other checks.
 986                               */
 987                              if (is_array($attary) && sizeof($attary) > 0){
 988                                  $attary = fixatts($tagname,
 989                                                    $attary,
 990                                                    $rm_attnames,
 991                                                    $bad_attvals,
 992                                                    $add_attr_to_tag);
 993                              }
 994                          }
 995                      }
 996                  } else {
 997                      //htmlfilter_debug("$me: Skipping this tag\n");
 998                  }
 999              }
1000              if ($tagname != false && $skip_content == false){
1001                  //htmlfilter_debug("$me: Appending tag to trusted.\n");
1002                  $trusted .= tagprint($tagname, $attary, $tagtype);
1003              }
1004          } else {
1005              //htmlfilter_debug("$me: Removing invalid tag\n");
1006          }
1007          $curpos = $gt + 1;
1008      }
1009      //htmlfilter_debug("$me: Appending any leftover content\n");
1010      $trusted .= substr($body, $curpos, strlen($body) - $curpos);
1011      if ($force_tag_closing == true){
1012          foreach ($open_tags as $tagname=>$opentimes){
1013              while ($opentimes > 0){
1014                  //htmlfilter_debug("$me: '$tagname' left open. Closing by force.\n");
1015                  $trusted .= '</' . $tagname . '>';
1016                  $opentimes--;
1017              }
1018          }
1019          $trusted .= " ";
1020      }
1021      // $trusted .= "<!-- end sanitized html -->\n";
1022      return $trusted;
1023  }
1024  ?>


Generated: Sun Jan 29 16:31:14 2012 Cross-referenced by PHPXref 0.7.1