Posted By

wizard04 on 06/24/08


Tagged

regex email url javascript parse validation uri mailto address


Versions (?)

Who likes this?

6 people have marked this snippet as a favorite

korzhik
LeeRJohnson
vali29
harikaram
Tyster
uggnot


Regular Expressions For URI Validation/Parsing


 / Published in: Regular Expression
 

(Supported by JavaScript, maybe other languages)

  // String.prototype.replace() can be used to parse a URI with these
  // expressions. For example, to get the path:
  //   path = uri.replace(regexUri, "$5$6");

  //****************************************************//
  //***************** Validate a URI *******************//
  //****************************************************//
  // Validates a generic URI and captures each component in its own group.
  // - The groups can be recombined depending on the scheme:
  //   - http   as $1://$3:$4$5?$7#$8
  //   - ftp    as $1://$2@$3:$4$5
  //   - mailto as $1:$6?$7
  // - Groups are as follows:
  //   1   == scheme
  //   2   == userinfo
  //   3   == host
  //   4   == port
  //   5,6 == path (5 if it has an authority, 6 if it doesn't)
  //   7   == query
  //   8   == fragment
  // - A group that does not take part in the match is undefined in exec()
  //   results and expands to the empty string in replace().
  // - The scheme must begin with a letter (RFC 3986, section 3.1).
  // - Every "/" is written as "\/": an unescaped "/" ends a regex literal, which
  //   made the earlier form of this expression a syntax error.

  var regexUri = /^([a-z][a-z0-9+.-]*):(?:\/\/(?:((?:[a-z0-9-._~!$&'()*+,;=:]|%[0-9A-F]{2})*)@)?((?:[a-z0-9-._~!$&'()*+,;=]|%[0-9A-F]{2})*)(?::(\d*))?(\/(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?|(\/?(?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})+(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?)(?:\?((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?(?:#((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?$/i;

  // The expression above is composed as follows (annotations follow " <-- "):
  //
  // ^
  // ([a-z][a-z0-9+.-]*):                                          <-- scheme
  // (?:
  //   \/\/                                                        <-- it has an authority:
  //   (?:((?:[a-z0-9-._~!$&'()*+,;=:]|%[0-9A-F]{2})*)@)?          <-- userinfo
  //   ((?:[a-z0-9-._~!$&'()*+,;=]|%[0-9A-F]{2})*)                 <-- host
  //   (?::(\d*))?                                                 <-- port
  //   (\/(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?          <-- path
  //   |
  //                                                               <-- it doesn't have an authority:
  //   (\/?(?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})+(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?   <-- path
  // )
  // (?:
  //   \?((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*)          <-- query string
  // )?
  // (?:
  //   #((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*)           <-- fragment
  // )?
  // $
  43.  
  //****************************************************//
  //** Validate a URI (includes delimiters in groups) **//
  //****************************************************//
  // Validates a generic URI; each captured part keeps its own delimiter.
  // - The groups can be recombined, for any scheme, as $1$6$2$3$4$5$7$8$9
  // - Groups are as follows:
  //   1,6 == scheme:// or scheme:  (1 if it has an authority, 6 if it doesn't)
  //   2   == userinfo@
  //   3   == host
  //   4   == :port
  //   5,7 == path (5 if it has an authority, 7 if it doesn't)
  //   8   == ?query
  //   9   == #fragment
  // - A group that does not take part in the match is undefined in exec()
  //   results and expands to the empty string in replace().
  // - The scheme must begin with a letter (RFC 3986, section 3.1); this applies
  //   to both alternatives (groups 1 and 6).

  var regexUriDelim = /^(?:([a-z][a-z0-9+.-]*:\/\/)((?:(?:[a-z0-9-._~!$&'()*+,;=:]|%[0-9A-F]{2})*)@)?((?:[a-z0-9-._~!$&'()*+,;=]|%[0-9A-F]{2})*)(:(?:\d*))?(\/(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?|([a-z][a-z0-9+.-]*:)(\/?(?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})+(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?)(\?(?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*)?(#(?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*)?$/i;
  59.  
  //****************************************************//
  //***************** Validate a URL *******************//
  //****************************************************//
  // Validates a URI with an http or https scheme.
  // - The different parts are kept in their own groups and can be recombined as
  //   $1://$2:$3$4?$5#$6
  // - Groups are as follows:
  //   1 == scheme ("http" or "https", matched case-insensitively)
  //   2 == host (at least three characters; a %XX escape counts as one)
  //   3 == port (one or more digits; undefined when no ":port" is present)
  //   4 == path (always captured; the empty string when there is no path)
  //   5 == query
  //   6 == fragment
  // - Does not validate the host portion (domain); just makes sure the string
  //   consists of valid characters (does not include IPv6 nor IPvFuture
  //   addresses as valid).

  var regexUrl = /^(https?):\/\/((?:[a-z0-9.-]|%[0-9A-F]{2}){3,})(?::(\d+))?((?:\/(?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})*)*)(?:\?((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?(?:#((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?$/i;
  71.  
  //****************************************************//
  //**************** Validate a Mailto *****************//
  //****************************************************//
  // Validates a URI with a mailto scheme.
  // - The different parts are kept in their own groups and can be recombined as
  //   $1:$2?$3
  // - Groups are as follows:
  //   1 == scheme ("mailto", matched case-insensitively)
  //   2 == path, i.e. the address list (undefined when nothing follows "mailto:")
  //   3 == query, e.g. "subject=..." (undefined when there is no "?")
  // - Does not validate the email addresses themselves.

  var regexMailto = /^(mailto):((?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})+)?(?:\?((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?$/i;

Report this snippet  

Comments

RSS Icon Subscribe to comments
Posted By: harikaram on October 15, 2010

Can URL paths really consist of "&,',! and $ unescaped?

Posted By: wizard04 on October 20, 2010

Yup. See http://tools.ietf.org/html/rfc3986

Posted By: uggnot on April 25, 2013

The only flaw I see with the var 'regexUri' is that it allows the scheme to start with a digit.

According to the spec you've linked to rfc3986 section 3.1 "Scheme" paragraph 2:

"Scheme names consist of a sequence of characters beginning with a letter and followed by any combination of letters, digits, plus ("+"), period ("."), or hyphen ("-")."

Simply replace the first set of parentheses after the ^ with ([a-z][a-z0-9+.-]*).

Sorry if i came off impolite, it wasn't meant that way. This code is a WONDERFUL resource.

Posted By: uggnot on April 25, 2013

I also had a problem using the regexUri variable in JavaScript; I needed to escape the forward slashes:

var regexUri = /^([a-z][a-z0-9+.-]*):(?:\/\/(?:((?:[a-z0-9-._~!$&'()*+,;=:]|%[0-9A-F]{2})*)@)?((?:[a-z0-9-._~!$&'()*+,;=]|%[0-9A-F]{2})*)(?::(\d*))?(\/(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?|(\/?(?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})+(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?)(?:\?((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?(?:#((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?$/i;

You need to login to post a comment.