[ Index ] |
PHP Cross Reference of phpwcms V1.5.0 _r431 (28.01.12) |
[Summary view] [Print] [Text view]
<?php
/**
 * htmlfilter.inc
 * ---------------
 * This set of functions allows you to filter html in order to remove
 * any malicious tags from it. Useful in cases when you need to filter
 * user input for any cross-site-scripting attempts.
 *
 * Copyright (C) 2002-2004 by Duke University
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301  USA
 *
 * @Author  Konstantin Riabitsev <icon@linux.duke.edu>
 * @Version 1.1 ($Date: 2005/06/30 18:06:08 $)
 */

/**
 * Debugging hook. In this build it is a deliberate no-op: it ignores
 * its argument and returns NULL. It is kept only so that any external
 * code still calling it keeps working.
 *
 * @param  $message  A string with the message to output (ignored).
 * @return           NULL, always.
 */
function htmlfilter_debug($message){
    return null;
}

/**
 * This function returns the final tag out of the tag name, an array
 * of attributes, and the type of the tag. This function is called by
 * htmlfilter_sanitize internally.
 *
 * Attribute values are emitted verbatim, so they must already carry
 * their own quotes (e.g. '"value"').
 *
 * @param  $tagname  the name of the tag.
 * @param  $attary   the array of attributes and their values, or false.
 * @param  $tagtype  The type of the tag: 1 opening, 2 closing,
 *                   3 XHTML-style content-less tag.
 * @return           a string with the final tag representation.
 */
function tagprint($tagname, $attary, $tagtype){
    if ($tagtype == 2){
        return '</' . $tagname . '>';
    }
    $fulltag = '<' . $tagname;
    if (is_array($attary) && sizeof($attary)){
        $atts = array();
        // foreach instead of each(): each() no longer exists in PHP 8
        // and depended on the array's internal pointer.
        foreach ($attary as $attname => $attvalue){
            $atts[] = "$attname=$attvalue";
        }
        $fulltag .= ' ' . join(' ', $atts);
    }
    if ($tagtype == 3){
        $fulltag .= ' /';
    }
    return $fulltag . '>';
}

/**
 * A small helper function to use with array_walk. Modifies a by-ref
 * value and makes it lowercase.
 *
 * @param  $val a value passed by-ref.
 * @return      void since it modifies a by-ref value.
 */
function casenormalize(&$val){
    $val = strtolower($val);
}

/**
 * This function skips any whitespace from the current position within
 * a string and to the next non-whitespace value.
 *
 * @param  $body   the string
 * @param  $offset the offset within the string where we should start
 *                 looking for the next non-whitespace character.
 * @return         the location within the $body where the next
 *                 non-whitespace char is located.
 */
function skipspace($body, $offset){
    // The cast covers older PHP versions where substr() returns false
    // when $offset is at or past the end of the string.
    $rest = (string) substr($body, $offset);
    if (preg_match('/^\s+/', $rest, $matches)){
        $offset += strlen($matches[0]);
    }
    return $offset;
}

/**
 * This function looks for the next character within a string. It's
 * really just a glorified "strpos", except it catches the failures
 * nicely.
 *
 * @param  $body   The string to look for needle in.
 * @param  $offset Start looking from this position.
 * @param  $needle The character/string to look for.
 * @return         location of the next occurrence of the needle, or
 *                 strlen($body) if needle wasn't found.
 */
function findnxstr($body, $offset, $needle){
    $bodylen = strlen($body);
    // strpos() throws on PHP 8 when the offset is past the end of the
    // string; treat that the same as "not found".
    if ($offset > $bodylen){
        return $bodylen;
    }
    $pos = strpos($body, $needle, $offset);
    if ($pos === false){
        return $bodylen;
    }
    return $pos;
}

/**
 * This function takes a PCRE-style regexp and tries to match it
 * within the string.
 *
 * The expression is embedded between '%' delimiters, so $reg must not
 * contain an unescaped '%'.
 *
 * @param  $body   The string to look for needle in.
 * @param  $offset Start looking from here.
 * @param  $reg    A PCRE-style regex to match.
 * @return         Returns a false if no matches found, or an array
 *                 with the following members:
 *                 - integer with the location of the match within $body
 *                 - string with whatever content between offset and the match
 *                 - string with whatever it is we matched
 */
function findnxreg($body, $offset, $reg){
    $matches = array();
    $preg_rule = '%^(.*?)(' . $reg . ')%s';
    preg_match($preg_rule, (string) substr($body, $offset), $matches);
    if (!isset($matches[0])){
        return false;
    }
    return array(
        $offset + strlen($matches[1]),
        $matches[1],
        $matches[2]
    );
}

/**
 * This function looks for the next tag.
 *
 * There are 3 kinds of tags:
 * 1. Opening tag, e.g.:
 *    <a href="blah">
 * 2. Closing tag, e.g.:
 *    </a>
 * 3. XHTML-style content-less tag, e.g.:
 *    <img src="blah"/>
 *
 * There are several types of attributes, with optional
 * [:space:] between members.
 * Type 1:
 *   attrname[:space:]=[:space:]'CDATA'
 * Type 2:
 *   attrname[:space:]=[:space:]"CDATA"
 * Type 3:
 *   attr[:space:]=[:space:]CDATA
 * Type 4:
 *   attrname
 *
 * Types 1 and 2 are left the same; in type 3 any '"' is converted to
 * &quot; and the value is wrapped in double quotes. Type 4 is converted
 * into attrname="yes".
 *
 * @param  $body   String where to look for the next tag.
 * @param  $offset Start looking from here.
 * @return         false if no more tags exist in the body, or
 *                 an array with the following members:
 *                 - string with the name of the tag
 *                 - array with attributes and their values
 *                 - integer with tag type (1, 2, or 3)
 *                 - integer where the tag starts (starting "<")
 *                 - integer where the tag ends (ending ">")
 *                 first three members will be false, if the tag is invalid.
 */
function getnxtag($body, $offset){
    $bodylen = strlen($body);
    if ($offset > $bodylen){
        // Past the end of body.
        return false;
    }
    $lt = findnxstr($body, $offset, '<');
    if ($lt == $bodylen){
        // No more tags found.
        return false;
    }
    /**
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    $pos = skipspace($body, $lt + 1);
    if ($pos >= $bodylen){
        return array(false, false, false, $lt, $bodylen);
    }

    $tagtype = false;
    switch (substr($body, $pos, 1)){
        case '/':
            // Closing tag (type 2).
            $tagtype = 2;
            $pos++;
            break;
        case '!':
            /**
             * A comment or an SGML declaration. Both get stripped.
             */
            if (substr($body, $pos + 1, 2) == '--'){
                $gt = strpos($body, '-->', $pos);
                if ($gt === false){
                    $gt = $bodylen;
                } else {
                    // Point at the closing '>' of '-->'.
                    $gt += 2;
                }
                return array(false, false, false, $lt, $gt);
            }
            $gt = findnxstr($body, $pos, '>');
            return array(false, false, false, $lt, $gt);
        default:
            /**
             * Assume tagtype 1 for now. If it's type 3, we'll switch values
             * later.
             */
            $tagtype = 1;
            break;
    }

    /**
     * Look for next [\W-_], which will indicate the end of the tag name.
     */
    $regary = findnxreg($body, $pos, '[^\w\-_]');
    if ($regary == false){
        // End of body reached while analyzing tag name.
        return array(false, false, false, $lt, $bodylen);
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = strtolower($tagname);

    /**
     * $match can be either of these:
     * '>'  indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/'  indicating that this is type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    switch ($match){
        case '/':
            /**
             * This is an xhtml-style tag with a closing / at the
             * end, like so: <img src="blah"/>. Check if it's followed
             * by the closing bracket. If not, then this tag is invalid
             */
            if (substr($body, $pos, 2) == '/>'){
                $pos++;
                $tagtype = 3;
            } else {
                $gt = findnxstr($body, $pos, '>');
                return array(false, false, false, $lt, $gt);
            }
            // Intentional fall-through: the tag ends here.
        case '>':
            // End of tag, no attributes.
            return array($tagname, false, $tagtype, $lt, $pos);
        default:
            /**
             * Anything but whitespace makes this an invalid tag.
             * Look for the next closing ">".
             */
            if (!preg_match('/\s/', $match)){
                $gt = findnxstr($body, $lt, '>');
                return array(false, false, false, $lt, $gt);
            }
    }

    /**
     * At this point we're here:
     * <tagname  attribute='blah'>
     * \-------^
     *
     * At this point we loop in order to find all attributes.
     */
    $attary = array();

    while ($pos <= $bodylen){
        $pos = skipspace($body, $pos);
        if ($pos == $bodylen){
            /**
             * Non-closed tag.
             */
            return array(false, false, false, $lt, $pos);
        }
        /**
         * See if we arrived at a ">" or "/>", which means that we reached
         * the end of the tag.
         */
        $matches = array();
        if (preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches)){
            $pos += strlen($matches[1]);
            if ($matches[2] == '/>'){
                $tagtype = 3;
                $pos++;
            }
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        $regary = findnxreg($body, $pos, '[^\w\-_]');
        if ($regary == false){
            /**
             * Looks like body ended before the end of tag.
             */
            return array(false, false, false, $lt, $bodylen);
        }
        list($pos, $attname, $match) = $regary;
        $attname = strtolower($attname);
        /**
         * We arrived at the end of attribute name. Several things possible
         * here:
         * '>'  means the end of the tag and this is attribute type 4
         * '/'  if followed by '>' means the same thing as above
         * '\s' means a lot of things -- look what it's followed by.
         *      anything else means the attribute is invalid.
         */
        switch ($match){
            case '/':
                /**
                 * This is an xhtml-style tag with a closing / at the
                 * end, like so: <img src="blah"/>. Check if it's followed
                 * by the closing bracket. If not, then this tag is invalid
                 */
                if (substr($body, $pos, 2) == '/>'){
                    $pos++;
                    $tagtype = 3;
                } else {
                    $gt = findnxstr($body, $pos, '>');
                    return array(false, false, false, $lt, $gt);
                }
                // Intentional fall-through: type 4 attribute ends the tag.
            case '>':
                $attary[$attname] = '"yes"';
                return array($tagname, $attary, $tagtype, $lt, $pos);
            default:
                /**
                 * Skip whitespace and see what we arrive at.
                 */
                $pos = skipspace($body, $pos);
                $char = substr($body, $pos, 1);
                /**
                 * Two things are valid here:
                 * '=' means this is attribute type 1 2 or 3.
                 * \w means this was attribute type 4.
                 * anything else we ignore and re-loop. End of tag and
                 * invalid stuff will be caught by our checks at the beginning
                 * of the loop.
                 */
                if ($char == '='){
                    $pos++;
                    $pos = skipspace($body, $pos);
                    /**
                     * Here are 3 possibilities:
                     * "'"  attribute type 1
                     * '"'  attribute type 2
                     * everything else is the content of tag type 3
                     */
                    $quot = substr($body, $pos, 1);
                    if ($quot == '\'' || $quot == '"'){
                        // Quoted value: look for the matching closing quote.
                        $closing = ($quot == '"') ? '\"' : '\'';
                        $regary = findnxreg($body, $pos + 1, $closing);
                        if ($regary == false){
                            // End of body reached before end of value.
                            return array(false, false, false, $lt, $bodylen);
                        }
                        list($pos, $attval, $match) = $regary;
                        $pos++;
                        $attary[$attname] = $quot . $attval . $quot;
                    } else {
                        /**
                         * Unquoted value. These are hateful.
                         * Look for \s, or >.
                         */
                        $regary = findnxreg($body, $pos, '[\s>]');
                        if ($regary == false){
                            return array(false, false, false, $lt, $bodylen);
                        }
                        list($pos, $attval, $match) = $regary;
                        /**
                         * If it's ">" it will be caught at the top.
                         * Escape embedded double quotes before wrapping
                         * the value in double quotes.
                         */
                        $attval = str_replace('"', '&quot;', $attval);
                        $attary[$attname] = '"' . $attval . '"';
                    }
                } else if (preg_match('|[\w/>]|', $char)){
                    /**
                     * That was attribute type 4.
                     */
                    $attary[$attname] = '"yes"';
                } else {
                    /**
                     * An illegal character. Find next '>' and return.
                     */
                    $gt = findnxstr($body, $pos, '>');
                    return array(false, false, false, $lt, $gt);
                }
        }
    }
    /**
     * The fact that we got here indicates that the tag end was never
     * found. Return invalid tag indication so it gets stripped.
     */
    return array(false, false, false, $lt, $bodylen);
}

/**
 * Translates entities into literal values so they can be checked.
 *
 * @param  $attvalue the by-ref value to check.
 * @param  $regex    the regular expression to check against; its first
 *                   capture group must hold the numeric value.
 * @param  $hex      whether the entities are hexadecimal.
 * @return           True or False depending on whether there were matches.
 */
function deent(&$attvalue, $regex, $hex=false){
    preg_match_all($regex, $attvalue, $matches);
    if (!is_array($matches) || sizeof($matches[0]) == 0){
        return false;
    }
    $repl = array();
    for ($i = 0; $i < sizeof($matches[0]); $i++){
        $numval = $matches[1][$i];
        if ($hex){
            $numval = hexdec($numval);
        }
        // chr() wraps modulo 256; make that explicit and keep the
        // argument a plain integer even for oversized values.
        $repl[$matches[0][$i]] = chr(((int) $numval) & 0xFF);
    }
    $attvalue = strtr($attvalue, $repl);
    return true;
}

/**
 * This function checks attribute values for entity-encoded values
 * and returns them translated into 8-bit strings so we can run
 * checks on them.
 *
 * @param  $attvalue A string to run entity check against.
 * @return           Nothing, modifies a reference value.
 */
function defang(&$attvalue){
    /**
     * Skip this if there aren't ampersands or backslashes.
     */
    if (strpos($attvalue, '&') === false
        && strpos($attvalue, '\\') === false){
        return;
    }
    // Each pass stops at the first pattern that matched; the loop
    // repeats until no pattern matches any more, which also unwinds
    // nested encodings.
    do {
        $m = deent($attvalue, '/\&#0*(\d+);*/s')
            || deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
            || deent($attvalue, '/\\\\(\d+)/s', true);
    } while ($m == true);
    $attvalue = stripslashes($attvalue);
}

/**
 * Kill any tabs, newlines, or carriage returns. Our friends the
 * makers of the browser with 95% market value decided that it'd
 * be funny to make "java[tab]script" be just as good as "javascript".
 *
 * NUL bytes and plain spaces are removed as well.
 *
 * @param  $attvalue The attribute value before extraneous spaces removed.
 * @return           Nothing, modifies a reference value.
 */
function unspace(&$attvalue){
    if (strcspn($attvalue, "\t\r\n\0 ") != strlen($attvalue)){
        $attvalue = str_replace(array("\t", "\r", "\n", "\0", " "),
                                array('',   '',   '',   '',   ''), $attvalue);
    }
}

/**
 * This function runs various checks against the attributes.
 *
 * @param  $tagname         String with the name of the tag.
 * @param  $attary          Array with all tag attributes.
 * @param  $rm_attnames     See description for htmlfilter_sanitize
 * @param  $bad_attvals     See description for htmlfilter_sanitize
 * @param  $add_attr_to_tag See description for htmlfilter_sanitize
 * @return                  Array with modified attributes.
 */
function fixatts($tagname,
                 $attary,
                 $rm_attnames,
                 $bad_attvals,
                 $add_attr_to_tag
                 ){
    // foreach walks a snapshot, so changing $attary inside is safe.
    foreach ($attary as $attname => $attvalue){
        /**
         * See if this attribute should be removed.
         */
        foreach ($rm_attnames as $matchtag=>$matchattrs){
            if (preg_match($matchtag, $tagname)){
                foreach ($matchattrs as $matchattr){
                    if (preg_match($matchattr, $attname)){
                        unset($attary[$attname]);
                        // Move on to the next attribute. A bare
                        // "continue" would only advance the innermost
                        // loop and let the value checks below re-add
                        // the attribute that was just removed.
                        continue 3;
                    }
                }
            }
        }
        /**
         * Remove any backslashes, entities, or extraneous whitespace.
         * This works on a local copy used for matching only; the stored
         * value changes only if one of the rules below rewrites it.
         */
        defang($attvalue);
        unspace($attvalue);

        /**
         * Now let's run checks on the attvalues.
         */
        foreach ($bad_attvals as $matchtag=>$matchattrs){
            if (preg_match($matchtag, $tagname)){
                foreach ($matchattrs as $matchattr=>$valary){
                    if (preg_match($matchattr, $attname)){
                        /**
                         * There are two arrays in valary.
                         * First is matches.
                         * Second one is replacements
                         */
                        list($valmatch, $valrepl) = $valary;
                        $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                        if ($newvalue !== $attvalue){
                            $attary[$attname] = $newvalue;
                        }
                    }
                }
            }
        }
    }
    /**
     * See if we need to append any attributes to this tag.
     */
    foreach ($add_attr_to_tag as $matchtag=>$addattary){
        if (preg_match($matchtag, $tagname)){
            $attary = array_merge($attary, $addattary);
        }
    }
    return $attary;
}

/**
 * This is the main function and the one you should actually be calling.
 * There are several variables you should be aware of and which need
 * special description.
 *
 * $tag_list
 * ----------
 * This is a simple one-dimensional array of strings, except for the
 * very first one. The first member should be either false or true.
 * In case it's FALSE, the following list will be considered a list of
 * tags that should be explicitly REMOVED from the body, and all
 * others that did not match the list will be allowed. If the first
 * member is TRUE, then the list is the list of tags that should be
 * explicitly ALLOWED -- any tag not matching this list will be
 * discarded.
 *
 * Examples:
 * $tag_list = array(
 *                   false,
 *                   "blink",
 *                   "link",
 *                   "object",
 *                   "meta",
 *                   "marquee",
 *                   "html"
 *                          );
 *
 * This will allow all tags except for blink, link, object, meta, marquee,
 * and html.
 *
 * $tag_list = array(
 *                   true,
 *                   "b",
 *                   "a",
 *                   "i",
 *                   "img",
 *                   "strong",
 *                   "em",
 *                   "p"
 *                  );
 *
 * This will remove all tags from the body except b, a, i, img, strong, em and
 * p.
 *
 * $rm_tags_with_content
 * ---------------------
 * This is a simple one-dimensional array of strings, which specifies the
 * tags to be removed with any and all content between the beginning and
 * the end of the tag.
 * Example:
 * $rm_tags_with_content = array(
 *                               "script",
 *                               "style",
 *                               "applet",
 *                               "embed"
 *                              );
 *
 * This will remove the following structure:
 * <script>
 *  window.alert("Isn't cross-site-scripting fun?!");
 * </script>
 *
 * If such a tag is never closed, everything after it is dropped.
 *
 * $self_closing_tags
 * ------------------
 * This is a simple one-dimensional array of strings, which specifies which
 * tags contain no content and should not be forcefully closed if this option
 * is turned on (see further).
 * Example:
 * $self_closing_tags = array(
 *                            "img",
 *                            "br",
 *                            "hr",
 *                            "input"
 *                           );
 *
 * $force_tag_closing
 * ------------------
 * Set it to true to forcefully close any tags opened within the document.
 * This is good if you want to take care of people who like to screw up
 * the pages by leaving unclosed tags like <a>, <b>, <i>, etc.
 *
 * $rm_attnames
 * -------------
 * Now we come to parameters that are more obscure. This parameter is
 * a nested array which is used to specify which attributes should be
 * removed. It goes like so:
 *
 * $rm_attnames = array(
 *   "PCRE regex to match tag name" =>
 *     array(
 *           "PCRE regex to match attribute name"
 *           )
 *   );
 *
 * Example:
 * $rm_attnames = array(
 *   "|.*|" =>
 *       array(
 *             "|target|i",
 *             "|^on.*|i"
 *            )
 *   );
 *
 * This will match all tags (.*), and specify that all attributes
 * named "target" and starting with "on" should be removed. This will take
 * care of the following problem:
 * <em onmouseover="window.alert('muahahahaha')">
 * The "onmouseover" will be removed.
 *
 * $bad_attvals
 * ------------
 * This is where it gets ugly. This is a nested array with many levels.
 * It goes like so:
 *
 * $bad_attvals = array(
 *   "pcre regex to match tag name" =>
 *     array(
 *           "pcre regex to match attribute name" =>
 *             array(
 *                   array(
 *                         "pcre regex to match attribute value"
 *                        ),
 *                   array(
 *                         "pcre regex replace a match from above with"
 *                        )
 *                  )
 *          )
 *   );
 *
 * An extensive example:
 *
 * $bad_attvals = array(
 *   "|.*|" =>
 *      array(
 *            "/^src|background|href|action/i" =>
 *                array(
 *                      array(
 *                            "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si"
 *                            ),
 *                      array(
 *                            "\\1http://veryfunny.com/\\2"
 *                            )
 *                      ),
 *            "/^style/i" =>
 *                array(
 *                      array(
 *                            "/expression/si",
 *                            "/url\(([\'\"])\s*https*:.*([\'\"])\)/si",
 *                            "/url\(([\'\"])\s*\S+script:.*([\'\"])\)/si"
 *                           ),
 *                      array(
 *                            "idiocy",
 *                            "url(\\1http://veryfunny.com/\\2)",
 *                            "url(\\1http://veryfunny.com/\\2)"
 *                           )
 *                      )
 *            )
 *  );
 *
 * This will take care of nearly all known cross-site scripting exploits,
 * plus some (see my filter sample at
 * http://www.mricon.com/html/phpfilter.html for a working version).
 *
 * $add_attr_to_tag
 * ----------------
 * This is a useful little feature which lets you add attributes to
 * certain tags. It is a nested array as well, but not at all like
 * the previous one. It goes like so:
 *
 * $add_attr_to_tag = array(
 *   "PCRE regex to match tag name" =>
 *     array(
 *           "attribute name"=>'"attribute value"'
 *          )
 *   );
 *
 * Note: don't forget quotes around attribute value.
 *
 * Example:
 *
 * $add_attr_to_tag = array(
 *   "/^a$/si" =>
 *     array(
 *           'target'=>'"_new"'
 *          )
 *   );
 *
 * This will change <a> tags and add target="_new" to them so all links
 * open in a new window. Note that attributes are only added to tags
 * that already carry at least one attribute.
 *
 *
 *
 * @param $body                 the string with HTML you wish to filter
 * @param $tag_list             see description above
 * @param $rm_tags_with_content see description above
 * @param $self_closing_tags    see description above
 * @param $force_tag_closing    see description above
 * @param $rm_attnames          see description above
 * @param $bad_attvals          see description above
 * @param $add_attr_to_tag      see description above
 * @return                      sanitized html safe to show on your pages.
 */
function htmlfilter_sanitize($body,
                  $tag_list = array(),
                  $rm_tags_with_content = array(),
                  $self_closing_tags = array(),
                  $force_tag_closing = true,
                  $rm_attnames = array(),
                  $bad_attvals = array(),
                  $add_attr_to_tag = array()
                  ){
    /**
     * Anything that is not an array is treated as an empty list
     * instead of relying on error suppression.
     */
    if (!is_array($tag_list)){
        $tag_list = array();
    }
    if (!is_array($rm_tags_with_content)){
        $rm_tags_with_content = array();
    }
    if (!is_array($self_closing_tags)){
        $self_closing_tags = array();
    }
    /**
     * Normalize rm_tags and rm_tags_with_content.
     */
    array_walk($tag_list, 'casenormalize');
    array_walk($rm_tags_with_content, 'casenormalize');
    array_walk($self_closing_tags, 'casenormalize');
    /**
     * See if tag_list is of tags to remove or tags to allow.
     * false  means remove these tags
     * true   means allow these tags
     * (After normalization the flag is '' or '1'; it is only ever
     * compared loosely.)
     */
    $rm_tags = array_shift($tag_list);
    $curpos = 0;
    $open_tags = array();
    $trusted = '';
    // false, or the name of the tag whose content is being dropped.
    $skip_content = false;
    /**
     * Take care of netscape's stupid javascript entities like
     * &{alert('boo')};
     */
    $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', trim($body));
    while (($curtag = getnxtag($body, $curpos)) != false){
        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
        // Free-standing content between the previous tag and this one.
        $free_content = substr($body, $curpos, $lt - $curpos);
        if ($skip_content == false){
            $trusted .= $free_content;
        }
        // Loose comparison on purpose: invalid tags come back as false,
        // and an empty tag name is treated as invalid too.
        if ($tagname != false){
            if ($tagtype == 2){
                if ($skip_content == $tagname){
                    /**
                     * Got to the end of tag we needed to remove.
                     */
                    $tagname = false;
                    $skip_content = false;
                } else if ($skip_content == false){
                    if (isset($open_tags[$tagname]) &&
                        $open_tags[$tagname] > 0){
                        $open_tags[$tagname]--;
                    } else {
                        // Closing tag that was never opened: remove.
                        $tagname = false;
                    }
                }
            } else if ($skip_content == false){
                /**
                 * See if this is a self-closing type and change
                 * tagtype appropriately.
                 */
                if ($tagtype == 1
                    && in_array($tagname, $self_closing_tags)){
                    $tagtype = 3;
                }
                /**
                 * See if we should skip this tag and any content
                 * inside it.
                 */
                if ($tagtype == 1
                    && in_array($tagname, $rm_tags_with_content)){
                    $skip_content = $tagname;
                } else if (($rm_tags == false
                            && in_array($tagname, $tag_list)) ||
                           ($rm_tags == true
                            && !in_array($tagname, $tag_list))){
                    // Tag is not allowed: remove.
                    $tagname = false;
                } else {
                    if ($tagtype == 1){
                        if (isset($open_tags[$tagname])){
                            $open_tags[$tagname]++;
                        } else {
                            $open_tags[$tagname] = 1;
                        }
                    }
                    /**
                     * This is where we run other checks.
                     */
                    if (is_array($attary) && sizeof($attary) > 0){
                        $attary = fixatts($tagname,
                                          $attary,
                                          $rm_attnames,
                                          $bad_attvals,
                                          $add_attr_to_tag);
                    }
                }
            }
            if ($tagname != false && $skip_content == false){
                $trusted .= tagprint($tagname, $attary, $tagtype);
            }
        }
        $curpos = $gt + 1;
    }
    /**
     * Append any leftover content, unless we are still inside a tag
     * that must be removed together with its content (it was never
     * closed, so the rest of the body belongs to it).
     */
    if ($skip_content == false){
        $trusted .= (string) substr($body, $curpos);
    }
    if ($force_tag_closing == true){
        foreach ($open_tags as $tagname=>$opentimes){
            while ($opentimes > 0){
                $trusted .= '</' . $tagname . '>';
                $opentimes--;
            }
        }
        $trusted .= " ";
    }
    return $trusted;
}
?>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Sun Jan 29 16:31:14 2012 | Cross-referenced by PHPXref 0.7.1 |