Published in: PHP
URL: http://www.dominicsayers.com/isemail/
A PHP function that correctly validates all parts of a given email address, according to RFCs 5322, 5321, 1123, 2396, 3696, 4291, 4343, 2821 & 2822. I’ve released it under a license that allows you to use it royalty-free in commercial or non-commercial work.
The test cases and the latest version of the code will always be here: http://code.google.com/p/isemail/source/browse/#svn/trunk
Expand |
Embed | Plain Text
<?php
/*
Copyright 2009 Dominic Sayers
    dominic_sayers@hotmail.com
    http://www.dominicsayers.com

Version 1.7

This source file is subject to the Common Public Attribution License Version 1.0 (CPAL) license.
The license terms are available through the world-wide-web at http://www.opensource.org/licenses/cpal_1.0
*/

// PHPLint modules
/*.
    require_module 'standard';
    require_module 'pcre';
.*/

/**
 * Check that $email is a syntactically valid email address.
 *
 * The address is split at its last at-sign into a local part and a domain.
 * The local part is validated as a dot-separated sequence of atoms and/or
 * quoted strings (comments allowed at either end of each element); the domain
 * is validated either as a bracketed address-literal (IPv4, IPv6 or mixed) or
 * as a dot-separated host name made of letter/digit/hyphen labels.
 *
 * Read the following RFCs to understand the constraints:
 *     (http://tools.ietf.org/html/rfc5322)
 *     (http://tools.ietf.org/html/rfc3696)
 *     (http://tools.ietf.org/html/rfc5321)
 *     (http://tools.ietf.org/html/rfc4291#section-2.2)
 *     (http://tools.ietf.org/html/rfc1123#section-2.1)
 *
 * NOTE(review): the published listing of this function had been collapsed
 * onto a few lines and had lost several statements (the assignments of
 * $emailLength, $atIndex, $localPart, $domain, $elementLength, $indexBrace,
 * $addressLiteral, $index, $groupCount and the IPv6 substring). They are
 * restored here from what the surviving code and comments require; confirm
 * against the upstream test cases before relying on edge-case behaviour.
 *
 * @param string $email    The address to validate.
 * @param bool   $checkDNS When true (and checkdnsrr() is available), also
 *                         require the domain to have an A or MX record.
 *
 * @return bool True if the address passes every check, false otherwise.
 */
/*.boolean.*/ function is_email (/*.string.*/ $email, $checkDNS = false) {
    // Anything that cannot be read as a string cannot be an address.
    if (!is_string($email)) {
        if (!is_scalar($email) && !(is_object($email) && method_exists($email, '__toString'))) {
            return false;
        }
        $email = (string) $email;
    }

    // the upper limit on address lengths should normally be considered to be 256
    //     (http://www.rfc-editor.org/errata_search.php?rfc=3696)
    //     NB I think John Klensin is misreading RFC 5321 and the limit should actually be 254
    //     However, I will stick to the published number until it is changed.
    //
    // The maximum total length of a reverse-path or forward-path is 256
    // characters (including the punctuation and element separators)
    //     (http://tools.ietf.org/html/rfc5321#section-4.5.3.1.3)
    $emailLength = strlen($email);
    if ($emailLength > 256) return false; // Too long

    // Contemporary email addresses consist of a "local part" separated from
    // a "domain part" (a fully-qualified domain name) by an at-sign ("@").
    //     (http://tools.ietf.org/html/rfc3696#section-3)
    $atIndex = strrpos($email, '@');

    if ($atIndex === false) return false; // No at-sign
    if ($atIndex === 0) return false; // No local part
    // strrpos() returns a zero-based offset, so a trailing at-sign sits at
    // $emailLength - 1 (comparing against $emailLength could never match).
    if ($atIndex === $emailLength - 1) return false; // No domain part

    // Sanitize comments
    // - remove nested comments, quotes and dots in comments
    // - remove parentheses and dots from quoted strings
    $braceDepth     = 0;
    $inQuote        = false;
    $escapeThisChar = false;

    for ($i = 0; $i < $emailLength; ++$i) {
        $char        = $email[$i];
        $replaceChar = false;

        if ($char === '\\') {
            $escapeThisChar = !$escapeThisChar; // Escape the next character?
        } else {
            switch ($char) {
                case '(':
                    if ($escapeThisChar || $inQuote) {
                        $replaceChar = true;
                    } elseif ($braceDepth++ > 0) {
                        $replaceChar = true; // Nested comment: neutralise the inner brace
                    }
                    break;
                case ')':
                    if ($escapeThisChar || $inQuote) {
                        $replaceChar = true;
                    } else {
                        if (--$braceDepth > 0) $replaceChar = true; // Still inside an outer comment
                        if ($braceDepth < 0) $braceDepth = 0;
                    }
                    break;
                case '"':
                    if ($escapeThisChar) {
                        $replaceChar = true;
                    } elseif ($braceDepth === 0) {
                        $inQuote = !$inQuote; // Are we inside a quoted string?
                    } else {
                        $replaceChar = true;
                    }
                    break;
                case '.': // Dots don't help us either
                    if ($escapeThisChar || $braceDepth > 0) $replaceChar = true;
                    break;
            }

            $escapeThisChar = false;
            if ($replaceChar) $email[$i] = 'x'; // Replace the offending character with something harmless
        }
    }

    $localPart = (string) substr($email, 0, $atIndex);
    $domain    = (string) substr($email, $atIndex + 1);
    $FWS       = "(?:(?:(?:[ \\t]*(?:\\r\\n))?[ \\t]+)|(?:[ \\t]+(?:(?:\\r\\n)[ \\t]+)*))"; // Folding white space

    // Patterns shared by the local-part and domain loops below.
    $trimFWS       = '/^' . $FWS . '|' . $FWS . '$/';   // leading or trailing folding white space
    $unescapedBrace = '/(?<!\\\\)(?:[\\(\\)])/';        // a parenthesis not preceded by a backslash
    $dotSplitter   = '/\\.(?=(?:[^\\"]*\\"[^\\"]*\\")*(?![^\\"]*\\"))/m'; // dots outside quoted strings

    // Let's check the local part for RFC compliance...
    //
    // local-part      =       dot-atom / quoted-string / obs-local-part
    // obs-local-part  =       word *("." word)
    //     (http://tools.ietf.org/html/rfc5322#section-3.4.1)
    //
    // Problem: need to distinguish between "first.last" and "first"."last"
    // (i.e. one element or two).
    $dotArray   = /*. (array[int]string) .*/ preg_split($dotSplitter, $localPart);
    $partLength = 0;

    foreach ($dotArray as $element) {
        // Remove any leading or trailing FWS
        $element = (string) preg_replace($trimFWS, '', $element);

        // Then we need to remove all valid comments (i.e. those at the start or end of the element)
        $elementLength = strlen($element);

        // The length guards avoid reading an offset of an empty string.
        if ($elementLength > 0 && $element[0] === '(') {
            $indexBrace = strpos($element, ')');
            if ($indexBrace !== false) {
                if (preg_match($unescapedBrace, (string) substr($element, 1, $indexBrace - 1)) > 0) {
                    return false; // Illegal characters in comment
                }
                $element       = (string) substr($element, $indexBrace + 1, $elementLength - $indexBrace - 1);
                $elementLength = strlen($element);
            }
        }

        if ($elementLength > 0 && $element[$elementLength - 1] === ')') {
            $indexBrace = strrpos($element, '(');
            if ($indexBrace !== false) {
                if (preg_match($unescapedBrace, (string) substr($element, $indexBrace + 1, $elementLength - $indexBrace - 2)) > 0) {
                    return false; // Illegal characters in comment
                }
                $element       = (string) substr($element, 0, $indexBrace);
                $elementLength = strlen($element);
            }
        }

        // Remove any leading or trailing FWS around the element (inside any comments)
        $element = (string) preg_replace($trimFWS, '', $element);

        // What's left counts towards the maximum length for this part
        if ($partLength > 0) $partLength++; // for the dot
        $partLength += strlen($element);

        // Each dot-delimited component can be an atom or a quoted string
        // (because of the obs-local-part provision)
        if (preg_match('/^"(?:.)*"$/s', $element) > 0) {
            // Quoted-string tests:
            //
            // Remove any FWS
            $element = (string) preg_replace('/(?<!\\\\)' . $FWS . '/', '', $element);
            // The final regex cannot tell \" from \\" from \\\" etc.,
            // so remove all \\ (escaped backslashes) from the string first...
            $element = (string) preg_replace('/\\\\\\\\/', ' ', $element);
            if (preg_match('/(?<!\\\\|^)["\\r\\n\\x00](?!$)|\\\\"$|""/', $element) > 0) return false; // ", CR, LF and NUL must be escaped, "" is too short
        } else {
            // Unquoted string tests:
            //
            // Period (".") may...appear, but may not be used to start or end the
            // local part, nor may two or more consecutive periods appear.
            //     (http://tools.ietf.org/html/rfc3696#section-3)
            //
            // A zero-length element implies a period at the beginning or end of the
            // local part, or two periods together. Either way it's not allowed.
            if ($element === '') return false; // Dots in wrong place

            // Any ASCII graphic (printing) character other than the
            // at-sign ("@"), backslash, double quote, comma, or square brackets may
            // appear without quoting. If any of that list of excluded characters
            // are to appear, they must be quoted
            //     (http://tools.ietf.org/html/rfc3696#section-3)
            //
            // Any excluded characters? i.e. 0x00-0x20, (, ), <, >, [, ], :, ;, @, \, comma, period, "
            if (preg_match('/[\\x00-\\x20\\(\\)<>\\[\\]:;@\\\\,\\."]/', $element) > 0) return false; // These characters must be in a quoted string
        }
    }

    if ($partLength > 64) return false; // Local part must be 64 characters or less

    // Now let's check the domain part...

    // The domain name can also be replaced by an IP address in square brackets
    //     (http://tools.ietf.org/html/rfc3696#section-3)
    //     (http://tools.ietf.org/html/rfc5321#section-4.1.3)
    //     (http://tools.ietf.org/html/rfc4291#section-2.2)
    if (preg_match('/^\\[(.)+]$/', $domain) === 1) {
        // It's an address-literal
        $addressLiteral = (string) substr($domain, 1, strlen($domain) - 2);
        $matchesIP      = array();

        // Extract IPv4 part from the end of the address-literal (if there is one)
        if (preg_match('/\\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/', $addressLiteral, $matchesIP) > 0) {
            $index = strrpos($addressLiteral, $matchesIP[0]);

            if ($index === 0) {
                // Nothing there except a valid IPv4 address, so...
                return true;
            } else {
                // Assume it's an attempt at a mixed address (IPv6 + IPv4)
                if ($addressLiteral[$index - 1] !== ':') return false; // Character preceding IPv4 address must be ':'
                if (substr($addressLiteral, 0, 5) !== 'IPv6:') return false; // RFC5321 section 4.1.3

                // Keep the IPv6 groups only; when the literal is "IPv6:::" + IPv4
                // the two colons themselves are the whole IPv6 portion.
                $IPv6     = (string) substr($addressLiteral, 5, ($index === 7) ? 2 : $index - 6);
                $groupMax = 6;
            }
        } else {
            // It must be an attempt at pure IPv6
            if (substr($addressLiteral, 0, 5) !== 'IPv6:') return false; // RFC5321 section 4.1.3
            $IPv6     = (string) substr($addressLiteral, 5);
            $groupMax = 8;
        }

        // Each match is one group; anything that is not a group is captured in (.)
        $groupCount = preg_match_all('/^[0-9a-fA-F]{0,4}|\\:[0-9a-fA-F]{0,4}|(.)/', $IPv6, $matchesIP);
        $index      = strpos($IPv6, '::');

        if ($index === false) {
            // We need exactly the right number of groups
            if ($groupCount !== $groupMax) return false; // RFC5321 section 4.1.3
        } else {
            if ($index !== strrpos($IPv6, '::')) return false; // More than one '::'
            // A '::' in the middle of the address stands for at least one group
            if (!($index === 0 || $index === (strlen($IPv6) - 2))) $groupMax--;
            if ($groupCount > $groupMax) return false; // Too many IPv6 groups in address
        }

        // Check for unmatched characters: any non-empty (.) capture is illegal
        if (implode('', $matchesIP[1]) !== '') return false; // Illegal characters in address

        // It's a valid IPv6 address, so...
        return true;
    } else {
        // It's a domain name...

        // The syntax of a legal Internet host name was specified in RFC-952
        // One aspect of host name syntax is hereby changed: the
        // restriction on the first character is relaxed to allow either a
        // letter or a digit.
        //     (http://tools.ietf.org/html/rfc1123#section-2.1)
        //
        // NB RFC 1123 updates RFC 1035, but this is not currently apparent from reading RFC 1035.
        //
        // Most common applications, including email and the Web, will generally not
        // permit...escaped strings
        //     (http://tools.ietf.org/html/rfc3696#section-2)
        //
        // the better strategy has now become to make the "at least one period" test,
        // to verify LDH conformance (including verification that the apparent TLD name
        // is not all-numeric)
        //     (http://tools.ietf.org/html/rfc3696#section-2)
        //
        // Characters outside the set of alphabetic characters, digits, and hyphen MUST NOT appear in domain name
        // labels for SMTP clients or servers
        //     (http://tools.ietf.org/html/rfc5321#section-4.1.2)
        //
        // RFC5321 precludes the use of a trailing dot in a domain name for SMTP purposes
        //     (http://tools.ietf.org/html/rfc5321#section-4.1.2)
        $dotArray   = /*. (array[int]string) .*/ preg_split($dotSplitter, $domain);
        $partLength = 0;
        $element    = ''; // Used after the loop (TLD test), so make sure it has a value

        if (count($dotArray) === 1) return false; // "At least one period" test: mail host can't be a bare TLD

        foreach ($dotArray as $element) {
            // Remove any leading or trailing FWS
            $element = (string) preg_replace($trimFWS, '', $element);

            // Then we need to remove all valid comments (i.e. those at the start or end of the element)
            $elementLength = strlen($element);

            // The length guards avoid reading an offset of an empty string.
            if ($elementLength > 0 && $element[0] === '(') {
                $indexBrace = strpos($element, ')');
                if ($indexBrace !== false) {
                    if (preg_match($unescapedBrace, (string) substr($element, 1, $indexBrace - 1)) > 0) {
                        return false; // Illegal characters in comment
                    }
                    $element       = (string) substr($element, $indexBrace + 1, $elementLength - $indexBrace - 1);
                    $elementLength = strlen($element);
                }
            }

            if ($elementLength > 0 && $element[$elementLength - 1] === ')') {
                $indexBrace = strrpos($element, '(');
                if ($indexBrace !== false) {
                    if (preg_match($unescapedBrace, (string) substr($element, $indexBrace + 1, $elementLength - $indexBrace - 2)) > 0) {
                        return false; // Illegal characters in comment
                    }
                    $element       = (string) substr($element, 0, $indexBrace);
                    $elementLength = strlen($element);
                }
            }

            // Remove any leading or trailing FWS around the element (inside any comments)
            $element       = (string) preg_replace($trimFWS, '', $element);
            $elementLength = strlen($element); // measure the label itself, not its surrounding white space

            // What's left counts towards the maximum length for this part
            if ($partLength > 0) $partLength++; // for the dot
            $partLength += $elementLength;

            // The DNS defines domain name syntax very generally -- a
            // string of labels each containing up to 63 8-bit octets,
            // separated by dots, and with a maximum total of 255
            // octets.
            //     (http://tools.ietf.org/html/rfc1123#section-6.1.3.5)
            if ($elementLength > 63) return false; // Label must be 63 characters or less

            // Each dot-delimited component must be atext
            // A zero-length element implies a period at the beginning or end of the
            // domain, or two periods together. Either way it's not allowed.
            if ($elementLength === 0) return false; // Dots in wrong place

            // Any ASCII graphic (printing) character other than the
            // at-sign ("@"), backslash, double quote, comma, or square brackets may
            // appear without quoting. If any of that list of excluded characters
            // are to appear, they must be quoted
            //     (http://tools.ietf.org/html/rfc3696#section-3)
            //
            // If the hyphen is used, it is not permitted to appear at
            // either the beginning or end of a label.
            //     (http://tools.ietf.org/html/rfc3696#section-2)
            //
            // Any excluded characters? i.e. 0x00-0x20, (, ), <, >, [, ], :, ;, @, \, comma, period, "
            if (preg_match('/[\\x00-\\x20\\(\\)<>\\[\\]:;@\\\\,\\."]|^-|-$/', $element) > 0) {
                return false;
            }
        }

        if ($partLength > 255) return false; // Domain must be 255 characters or less

        // $element now holds the last label, i.e. the apparent TLD
        if (preg_match('/^[0-9]+$/', $element) > 0) return false; // TLD can't be all-numeric

        // Check DNS?
        if ($checkDNS && function_exists('checkdnsrr')) {
            if (!(checkdnsrr($domain, 'A') || checkdnsrr($domain, 'MX'))) {
                return false; // Domain doesn't actually exist
            }
        }
    }

    // Eliminate all other factors, and the one which remains must be the truth.
    //     (Sherlock Holmes, The Sign of Four)
    return true;
}
?>
You need to login to post a comment.