/ Published in: PHP
URL: http://www.dom111.co.uk/blog/coding/xml-entities-in-php/224
Because htmlentities just doesn't cut it.
I recently needed this to work with third-party software and ended up having to make it pretty robust, so I thought I'd share!
Expand |
Embed | Plain Text
<?php

/**
 * unicode_ord
 *
 * Returns the Unicode code point of the UTF-8 encoded character that starts
 * at byte offset $i of $c.
 *
 * When the character is encoded with more than one byte, $i is advanced so
 * that it points at the LAST byte of that character. A caller iterating with
 * `for (...; $i++)` therefore lands on the first byte of the next character
 * on its next iteration.
 *
 * @param string  $c The source string (UTF-8 bytes). Taken by reference only
 *                   to avoid a copy; it is never modified.
 * @param integer $i The byte offset to read from (passed by reference for use
 *                   in a loop; advanced past the continuation bytes consumed).
 * @return integer|false The code point, or false when the offset is out of
 *                       range, the lead byte is not a valid UTF-8 lead byte,
 *                       the sequence is truncated, or a continuation byte is
 *                       malformed. $i is left untouched when false is returned.
 * @author kerry at shetline dot com
 * @author Dom Hastings - modified to suit my needs
 * @see http://www.php.net/manual/en/function.ord.php#78032
 */
function unicode_ord(&$c, &$i = 0)
{
    // get the string length in bytes
    $l = strlen($c);

    // copy the offset so $i is only changed once we know the sequence is good
    $index = $i;

    // check it's a valid offset
    if ($index < 0 || $index >= $l) {
        return false;
    }

    // value of the lead byte
    $o = ord($c[$index]);

    // plain ASCII
    if ($o <= 0x7F) {
        return $o;
    }

    // 0x80-0xC1 are continuation bytes or overlong lead bytes,
    // 0xF5-0xFF can never appear in UTF-8
    if ($o < 0xC2 || $o > 0xF4) {
        return false;
    }

    // work out how many continuation bytes follow and which bits of the
    // lead byte carry payload
    if ($o <= 0xDF) {
        // two-byte character
        $extra = 1;
        $code = $o & 0x1F;
    } elseif ($o <= 0xEF) {
        // three-byte character
        $extra = 2;
        $code = $o & 0x0F;
    } else {
        // four-byte character
        $extra = 3;
        $code = $o & 0x07;
    }

    // truncated sequence
    if ($index + $extra >= $l) {
        return false;
    }

    // fold in six payload bits from every continuation byte (10xxxxxx)
    for ($k = 1; $k <= $extra; $k++) {
        $b = ord($c[$index + $k]);

        if (($b & 0xC0) !== 0x80) {
            return false;
        }

        $code = ($code << 6) | ($b & 0x3F);
    }

    // tell the caller how far we read
    $i += $extra;

    return $code;
}

/**
 * unicode_chr
 *
 * Returns the UTF-8 encoding of a Unicode code point.
 *
 * @param integer $c The code point (0 - 0x10FFFF). Taken by reference for
 *                   backwards compatibility; it is never modified, but callers
 *                   must pass a variable, not a literal.
 * @return string|false The UTF-8 byte sequence, or false when $c is not a
 *                      number or lies outside 0 - 0x10FFFF.
 * @author Miguel Perez
 * @see http://www.php.net/manual/en/function.chr.php#77911
 */
function unicode_chr(&$c)
{
    // range-check before casting so huge floats cannot wrap around
    if (!is_numeric($c) || $c < 0 || $c > 0x10FFFF) {
        return false;
    }

    $code = (int) $c;

    if ($code <= 0x7F) {
        return chr($code);
    } elseif ($code <= 0x7FF) {
        return chr(0xC0 | ($code >> 6))
            . chr(0x80 | ($code & 0x3F));
    } elseif ($code <= 0xFFFF) {
        return chr(0xE0 | ($code >> 12))
            . chr(0x80 | (($code >> 6) & 0x3F))
            . chr(0x80 | ($code & 0x3F));
    }

    return chr(0xF0 | ($code >> 18))
        . chr(0x80 | (($code >> 12) & 0x3F))
        . chr(0x80 | (($code >> 6) & 0x3F))
        . chr(0x80 | ($code & 0x3F));
}

/**
 * xmlentities
 *
 * Makes the specified string XML-safe: the five predefined XML entities are
 * emitted by name, tab/newline-style whitespace (code points 9-13) and the
 * remaining printable ASCII characters are kept as-is, and everything else is
 * emitted as a numeric character reference.
 *
 * Bytes that do not form a valid UTF-8 sequence, and NUL, are dropped.
 *
 * NOTE(review): code points 11 and 12 are passed through raw and other control
 * characters become numeric references; XML 1.0 does not allow either, so
 * strip control characters beforehand if strict XML 1.0 output is required.
 *
 * @param string  $s   The UTF-8 source string
 * @param boolean $hex Whether or not to make hexadecimal entities (as opposed to decimal)
 * @return string The XML-safe result
 * @author Dom Hastings
 * @dependencies unicode_ord()
 * @see http://www.w3.org/TR/REC-xml/#sec-predefined-ent
 */
function xmlentities($s, $hex = true)
{
    // if the string is empty just return it
    if ($s === null || $s === '') {
        return $s;
    }

    $s = (string) $s;

    // the predefined entities, keyed by code point
    $named = array(
        38 => '&amp;',
        60 => '&lt;',
        62 => '&gt;',
        39 => '&apos;',
        34 => '&quot;',
    );

    // create the return string
    $r = '';

    // get the length
    $l = strlen($s);

    // iterate the string; unicode_ord() advances $i over multi-byte characters
    for ($i = 0; $i < $l; $i++) {
        // get the value of the character (false for invalid UTF-8)
        $o = unicode_ord($s, $i);

        if ($o === false) {
            // invalid byte, skip it
            continue;
        }

        if (isset($named[$o])) {
            // & < > ' "
            $r .= $named[$o];
        } elseif (($o >= 9 && $o <= 13) || ($o >= 32 && $o <= 126)) {
            // whitespace controls and printable ASCII: single byte, keep it
            $r .= $s[$i];
        } elseif ($o > 0) {
            // anything else, add it as a character reference
            if ($hex) {
                $r .= '&#x' . strtoupper(dechex($o)) . ';';
            } else {
                $r .= '&#' . $o . ';';
            }
        }
    }

    return $r;
}

/**
 * xmlentity_decode
 *
 * Converts XML entity encoded data back to a unicode (UTF-8) string.
 * Handles decimal (&#233;) and hexadecimal (&#xE9;) character references,
 * the five predefined XML entities, and any extra named entities supplied.
 *
 * @param string $s        The XML encoded string
 * @param array  $entities Additional entities to decode (optional), as
 *                         name => replacement. amp, lt, gt, apos and quot
 *                         cannot be overridden.
 * @return string
 * @throws Exception When $entities is not an array, or when an entity is
 *                   unknown, malformed, or refers to a code point above
 *                   0x10FFFF. An "&" with no closing ";" treats the rest of
 *                   the string as the entity name and so normally throws too.
 * @dependencies unicode_chr()
 * @author Dom Hastings
 */
function xmlentity_decode($s, $entities = array())
{
    // if the string is empty, just return it
    if ($s === null || $s === '') {
        return $s;
    }

    // check that entities is an array
    if (!is_array($entities)) {
        throw new Exception('xmlentity_decode expects argument 2 to be array.');
    }

    // initialise vars
    $s = (string) $s;
    $l = strlen($s);
    $r = '';

    // merge the entities with the defaults (amp, lt, gt, apos and quot MUST take precedence)
    $entities = array_merge($entities, array(
        'amp' => '&',
        'lt' => '<',
        'gt' => '>',
        'apos' => '\'',
        'quot' => '"'
    ));

    // loop through the string
    for ($i = 0; $i < $l; $i++) {
        // if it's just a regular char, append it
        if ($s[$i] != '&') {
            $r .= $s[$i];
            continue;
        }

        // it looks like an entity: collect everything up to the semi-colon,
        // or up to the end of the string when there is none
        $end = strpos($s, ';', $i + 1);

        if ($end === false) {
            $e = (string) substr($s, $i + 1);
            $i = $l - 1;
        } else {
            $e = (string) substr($s, $i + 1, $end - $i - 1);
            $i = $end;
        }

        // if the first char is a #, it's a numeric entity
        if ($e !== '' && $e[0] == '#') {
            $digits = (string) substr($e, 1);
            $code = false;

            if ($digits !== '' && $digits[0] == 'x') {
                // hexadecimal reference
                $digits = (string) substr($digits, 1);

                if (preg_match('/^[0-9A-Fa-f]+$/D', $digits)) {
                    $code = hexdec($digits);
                }
            } elseif (preg_match('/^[0-9]+$/D', $digits)) {
                // decimal reference
                $code = (int) $digits;
            }

            // get the unicode char from the number
            $char = ($code === false) ? false : unicode_chr($code);

            if ($char === false) {
                throw new Exception('Unknown entity "'.$e.'"');
            }

            $r .= $char;
        } elseif (array_key_exists($e, $entities)) {
            // it's in our array (which it should be), append the character
            $r .= $entities[$e];
        } else {
            // throw an exception, we don't know what to do with this
            throw new Exception('Unknown entity "'.$e.'"');
        }
    }

    return $r;
}
You need to login to post a comment.
