Posted By

wizard04 on 06/24/08


Tagged

regex email url javascript parse validation uri mailto address


Versions (?)

Who likes this?

6 people have marked this snippet as a favorite

korzhik
LeeRJohnson
vali29
harikaram
Tyster
uggnot


Regular Expressions For URI Validation/Parsing


 / Published in: Regular Expression
 

(Supported by JavaScript, maybe other languages)

  1. //String.prototype.replace() can be used to parse the URI. For example, to get the path:
  2. // path = uri.replace(regexUri, "$7$9");
  3.  
  4. //****************************************************//
  5. //***************** Validate a URI *******************//
  6. //****************************************************//
  7. //- Matching is case-insensitive (i flag). The different parts are kept in
  8. // their own groups and can be recombined depending on the scheme:
  9. // - http as $1://$2$7?$11#$12 or $1://$5:$6$7?$11#$12
  10. // - ftp as $1://$2$7 or $1://$4@$5:$6$7
  11. // - mailto as $1:$9?$11
  12. //- groups are as follows (the "?" and "#" delimiters are not captured):
  13. // 1 == scheme
  14. // 2 == authority
  15. // 4 == userinfo
  16. // 5 == host (loose check to allow for IPv6 addresses)
  17. // 6 == port
  18. // 7,9 == path (7 if it has an authority, 9 if it doesn't)
  19. // 11 == query
  20. // 12 == fragment
  21. // 3,8,10 == helper captures made inside lookaheads and consumed by \3, \8, \10
  22. var regexUri = /^([a-z][a-z0-9+.-]*):(?:\/\/((?:(?=((?:[a-z0-9-._~!$&'()*+,;=:]|%[0-9A-F]{2})*))(\3)@)?(?=(\[[0-9A-F:.]{2,}\]|(?:[a-z0-9-._~!$&'()*+,;=]|%[0-9A-F]{2})*))\5(?::(?=(\d*))\6)?)(\/(?=((?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*))\8)?|(\/?(?!\/)(?=((?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*))\10)?)(?:\?(?=((?:[a-z0-9-._~!$&'()*+,;=:@\/?]|%[0-9A-F]{2})*))\11)?(?:#(?=((?:[a-z0-9-._~!$&'()*+,;=:@\/?]|%[0-9A-F]{2})*))\12)?$/i;
  23. /*composed as follows (each "(?=(X))\n" pair captures X inside a lookahead, then consumes it via backreference \n):
  24. ^
  25. ([a-z][a-z0-9+.-]*): #1 scheme
  26. (?:
  27. \/\/ it has an authority:
  28.  
  29. ( #2 authority
  30. (?:(?=((?:[a-z0-9-._~!$&'()*+,;=:]|%[0-9A-F]{2})*))(\3)@)? #3 helper capture, #4 userinfo
  31. (?=(\[[0-9A-F:.]{2,}\]|(?:[a-z0-9-._~!$&'()*+,;=]|%[0-9A-F]{2})*))\5 #5 host (loose check to allow for IPv6 addresses)
  32. (?::(?=(\d*))\6)? #6 port
  33. )
  34.  
  35. (\/(?=((?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*))\8)? #7 path (includes the leading "/"), #8 helper capture
  36.  
  37. | it doesn't have an authority:
  38.  
  39. (\/?(?!\/)(?=((?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*))\10)? #9 path (must not start with "//"), #10 helper capture
  40. )
  41. (?:
  42. \?(?=((?:[a-z0-9-._~!$&'()*+,;=:@\/?]|%[0-9A-F]{2})*))\11 #11 query string
  43. )?
  44. (?:
  45. #(?=((?:[a-z0-9-._~!$&'()*+,;=:@\/?]|%[0-9A-F]{2})*))\12 #12 fragment
  46. )?
  47. $
  48. */
  49.  
  50. //****************************************************//
  51. //** Validate a URI (includes delimiters in groups) **//
  52. //****************************************************//
  53. //- The different parts--along with their delimiters--are kept in their own
  54. // groups and can be recombined as $1$6$2$3$4$5$7$8$9
  55. //- groups are as follows:
  56. // 1,6 == scheme:// or scheme:
  57. // 2 == userinfo@
  58. // 3 == host
  59. // 4 == :port
  60. // 5,7 == path (5 if it has an authority, 7 if it doesn't)
  61. // 8 == ?query
  62. // 9 == #fragment
  63.  
  // Delimiter-capturing URI validator (case-insensitive).
  // Groups: 1 or 6 = "scheme://" or "scheme:", 2 = "userinfo@", 3 = host,
  // 4 = ":port", 5 or 7 = path, 8 = "?query", 9 = "#fragment".
  // The scheme must begin with a letter (RFC 3986, section 3.1), so both scheme
  // alternatives use [a-z][a-z0-9+.-]* rather than [a-z0-9+.-]+, which would also
  // accept a leading digit, "+", "." or "-".
  var regexUriDelim = /^(?:([a-z][a-z0-9+.-]*:\/\/)((?:(?:[a-z0-9-._~!$&'()*+,;=:]|%[0-9A-F]{2})*)@)?((?:[a-z0-9-._~!$&'()*+,;=]|%[0-9A-F]{2})*)(:(?:\d*))?(\/(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?|([a-z][a-z0-9+.-]*:)(\/?(?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})+(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?)(\?(?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*)?(#(?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*)?$/i;
  65.  
  66. //****************************************************//
  67. //***************** Validate a URL *******************//
  68. //****************************************************//
  69. //Validates a URI with an http or https scheme (case-insensitive).
  70. //- The different parts are kept in their own groups and can be recombined as
  71. // $1://$2:$3$4?$5#$6
  72. //- Does not validate the host portion (domain); just makes sure it has at least
  73. // three characters (a %XX escape counts as one) and uses only valid characters.
  74. // IPv6/IPvFuture addresses and a "userinfo@" prefix are not accepted.
  75. //- groups: 1 scheme, 2 host, 3 port (digits only), 4 path, 5 query, 6 fragment
  76. var regexUrl = /^(https?):\/\/((?:[a-z0-9.-]|%[0-9A-F]{2}){3,})(?::(\d+))?((?:\/(?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})*)*)(?:\?((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?(?:#((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?$/i;
  77.  
  78. //****************************************************//
  79. //**************** Validate a Mailto *****************//
  80. //****************************************************//
  81. //Validates a URI with a mailto scheme (case-insensitive).
  82. //- The different parts are kept in their own groups and can be recombined as
  83. // $1:$2?$3
  84. //- Does not validate the email addresses themselves.
  85. //- groups: 1 scheme, 2 address part (optional), 3 query (without the "?")
  86. var regexMailto = /^(mailto):((?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})+)?(?:\?((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?$/i;

Report this snippet  

Comments

RSS Icon Subscribe to comments
Posted By: harikaram on October 15, 2010

Can URL paths really contain &, ', ! and $ unescaped?

Posted By: wizard04 on October 20, 2010

Yup. See http://tools.ietf.org/html/rfc3986

Posted By: uggnot on April 25, 2013

The only flaw I see with the var 'regexUri' is that it allows the scheme to start with a number.

According to the spec you've linked to rfc3986 section 3.1 "Scheme" paragraph 2:

"Scheme names consist of a sequence of characters beginning with a letter and followed by any combination of letters, digits, plus ("+"), period ("."), or hyphen ("-")."

Simply replace the first set of parentheses after the ^ with ([a-z][a-z0-9+.-]*).

Sorry if I came off as impolite; it wasn't meant that way. This code is a WONDERFUL resource.

Posted By: uggnot on April 25, 2013

I also had a problem using the regexUri variable in JavaScript; I needed to escape the forward slashes:

// NOTE(review): reconstructed. The "*" quantifiers and two "_" class members appear
// to have been stripped from this pattern by the comment markup when it was posted
// (as posted, the scheme had to be exactly two characters, so nothing useful matched).
// They are restored here by analogy with the other patterns on this page.
// Case-insensitive. Groups: 1 scheme, 2 userinfo, 3 host, 4 port,
// 5 path (with an authority), 6 path (without an authority), 7 query, 8 fragment.
var regexUri = /^([a-z][a-z0-9+.-]*):(?:\/\/(?:((?:[a-z0-9-._~!$&'()*+,;=:]|%[0-9A-F]{2})*)@)?((?:[a-z0-9-._~!$&'()*+,;=]|%[0-9A-F]{2})*)(?::(\d*))?(\/(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?|(\/?(?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})+(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?)(?:\?((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?(?:#((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?$/i;

Posted By: wizard04 on April 16, 2014

Sorry I haven't logged into this site in a long time. My latest iteration is at https://github.com/wizard04wsu/URI_Parsing

You need to login to post a comment.