Posted By

wizard04 on 06/25/08


Tagged

email url javascript parse readable domain uri ip mailto normalize


Versions (?)

Who likes this?

3 people have marked this snippet as a favorite

korzhik
joomla
jschilling


Parse URIs


 / Published in: JavaScript
 

Functions for validating, parsing, and normalizing URIs and their parts. If you find any errors, please leave a comment.

- parseURI(str): splits a URI into its parts
- parseQueryNumeric(str): splits a query string into its name/value pairs; returns an array of {name, value} objects
- parseQueryAssociative(str): splits a query string into its name/value pairs; returns an associative array
- parseURL(str): splits a URL (i.e., an http(s) scheme URI) into its parts
- normalizeURLDomain(domain): converts an obscured URL domain to a more readable one
- normalizeIPv4(ip): normalizes an IPv4 address
- normalizeIPv6(ip): normalizes an IPv6 address
- normalizeURLPath(path): converts an obscured URL path to a more readable one
- parseMailto(str): splits a mailto scheme URI into its parts
- normalizeEmailAddress(str): converts an obscured email address to a more readable one; unfolds it and removes comments
- fixURL(str, domain): attempts to fix a URL if needed
- fixHyperlink(str, domain, allowMailto): attempts to fix a hyperlink address (http(s) or mailto) if needed

For URLs, note that IPvFuture addresses are not supported.

  1. //****************************************************************
  2. //**************************** URI *******************************
  3. //****************************************************************
  4.  
  5. //splits a URI into its parts
  6. //returns null if str is not a valid URI
  7. //does not support IPvFuture domains
  8. //see RFC 3986 http://www.faqs.org/rfcs/rfc3986.html
  9. function parseURI(str)
  10. {
  11. if(!str) return null;
  12.  
  13. var regexUri = /^([a-z0-9+.-]+):(?:\/\/(?:((?:[a-z0-9-._~!$&'()*+,;=:]|%[0-9A-F]{2})*)@)?((?:[a-z0-9-._~!$&'()*+,;=]|%[0-9A-F]{2})*|\[(?:[0-9A-F:.]{2,})\])(?::(\d*))?(\/(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?|(\/?(?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})+(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?)(?:\?((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?(?:#((?:[a-z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*))?$/i;
  14. //'
  15. /*composed as follows:
  16. ^
  17. ([a-z0-9+.-]+): #scheme
  18. (?:
  19. \/\/ #it has an authority:
  20. (?:((?:[a-z0-9-._~!$&'()*+,;=:]|%[0-9A-F]{2})*)@)? #userinfo
  21. ((?:[a-z0-9-._~!$&'()*+,;=]|%[0-9A-F]{2})*|\[(?:[0-9A-F:.]{2,})\]) #host (loose check to allow for IPv6 addresses)
  22. (?::(\d*))? #port
  23. (\/(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)? #path
  24. |
  25. #it doesn't have an authority:
  26. (\/?(?:[a-z0-9-._~!$&'()*+,;=:@]|%[0-9A-F]{2})+(?:[a-z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)? #path
  27. )
  28. (?:
  29. \?((?:[a-z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*) #query string
  30. )?
  31. (?:
  32. #((?:[a-z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*) #fragment
  33. )?
  34. $
  35. */
  36. if(!regexUri.test(str)) return null; //invalid URI
  37.  
  38. //these extra steps are required to check for validity of the host depending on if it's a URL or not,
  39. // since URLs allow IPv6 addresses (i.e., they allow '[', ':', and ']')
  40. var scheme = str.replace(regexUri, "$1").toLowerCase();
  41. var host = str.replace(regexUri, "$3");
  42. if(host && (scheme == "http" || scheme == "https")) //if it's a URL
  43. {
  44. if(!normalizeURLDomain(host)) return null; //invalid host
  45. }
  46. else if(host) //host may not include '[', ':', or ']'
  47. {
  48. if((/[:\[\]]/).test(host)) return null; //invalid host
  49. }
  50.  
  51. var parts = {
  52. uri: scheme+str.slice(scheme.length), //make sure scheme is lower case
  53. scheme: scheme,
  54. authority: "", //userinfo@host:port
  55. userinfo: str.replace(regexUri, "$2"),
  56. host: host,
  57. port: str.replace(regexUri, "$4"),
  58. path: str.replace(regexUri, "$5$6"),
  59. query: str.replace(regexUri, "$7"),
  60. fragment: str.replace(regexUri, "$8")
  61. };
  62. parts.authority = (parts.userinfo ? parts.userinfo+"@" : "") + parts.host + (parts.port ? ":"+parts.port : "");
  63.  
  64. return parts;
  65. }
  66.  
  67. //splits a query string into its name/value pairs
  68. //returns a 2-D array
  69. function parseQueryNumeric(str)
  70. {
  71. var results = []; //array of objects {name, value}
  72.  
  73. var pairs = str.split("&");
  74. var pair, j, result;
  75. for(var i=0; i<pairs.length; i++)
  76. {
  77. pair = pairs[i].split("=");
  78. if(!pair[0]) continue; //if there is no name, skip it
  79. result = {
  80. name: pair[0],
  81. value: ""
  82. };
  83. if(pair.length > 0) //if it has a value
  84. {
  85. result.value = pair[1]; //set the value
  86. for(j=2; j<pair.length; j++) //if there is more than one "=", include its encoded form in the value
  87. {
  88. result.value += "%3D"+pair[j];
  89. }
  90. }
  91. results.push(result);
  92. }
  93.  
  94. return results;
  95. }
  96. //splits a query string into its name/value pairs
  97. //returns an associative array
  98. //if there are multiple pairs with the same name, the last pair is used
  99. function parseQueryAssociative(str)
  100. {
  101. var results = {}; //associative array
  102.  
  103. var pairs = str.split("&");
  104. var pair, j, result;
  105. for(var i=0; i<pairs.length; i++)
  106. {
  107. pair = pairs[i].split("=");
  108. if(!pair[0]) continue; //if there is no name, skip it
  109. results[pair[0]] = "";
  110. if(pair.length > 0) //if it has a value
  111. {
  112. results[pair[0]] = pair[1]; //set the value
  113. for(j=2; j<pair.length; j++) //if there is more than one "=", include its encoded form in the value
  114. {
  115. results[pair[0]] += "%3D"+pair[j];
  116. }
  117. }
  118. }
  119.  
  120. return results;
  121. }
  122.  
  123. //****************************************************************
  124. //**************************** URL *******************************
  125. //****************************************************************
  126.  
  127. //splits a URL (i.e., http(s) scheme URI) into its parts
  128. //returns null if str is not a valid URL
  129. //does not support IPvFuture domains
  130. //see RFC 2616 http://tools.ietf.org/html/rfc2616
  131. //note: according to the RFC, fragments aren't part of a URL (they're only used by the browser, never sent to the server)
  132. // but this function allows them anyway, of course
  133. function parseURL(str)
  134. {
  135. var uri = parseURI(str);
  136. if(!uri) return null; //invalid URI
  137. if((uri.scheme != "http" && uri.scheme != "https") || !uri.authority) return null; //it's not a URL
  138. if(!uri.host) return null; //no domain
  139.  
  140. var parts = {
  141. url: "",
  142. protocol: uri.scheme,
  143. authority: "", //domain:port
  144. domain: normalizeURLDomain(uri.host),
  145. port: uri.port, //defaults: http 80, https 443
  146. path: (normalizeURLPath(uri.path) || "/"),
  147. query: uri.query,
  148. anchor: uri.fragment
  149. };
  150. if(!parts.domain) return null; //invalid domain
  151. parts.authority = parts.domain + (parts.port ? ":"+parts.port : "");
  152. parts.url = parts.protocol + "://" + parts.authority + parts.path + (parts.query ? "?"+parts.query : "") +
  153. (parts.anchor ? "#"+parts.anchor : "");
  154.  
  155. return parts;
  156. }
  157.  
  158. //converts an obscured URL domain to a more readable one
  159. //returns "" if it's not a valid domain
  160. //does not support IPvFuture domains
  161. //see http://www.pc-help.org/obscure.htm
  162. // and RFC 1123 http://tools.ietf.org/html/rfc1123#section-2 (Section 2.1)
  163. // and RFC 952 http://tools.ietf.org/html/rfc952 (ASSUMPTIONS 1, GRAMMATICAL HOST TABLE SPECIFICATION)
  164. // and RFC 2181 http://tools.ietf.org/html/rfc2181#section-11 (Section 11)
  165. function normalizeURLDomain(domain)
  166. {
  167. if(!domain) return "";
  168. if(domain.toLowerCase() == "localhost") return "localhost";
  169.  
  170. domain = domain.replace(/%3(\d)/g, "$1"); //decimals
  171. //upper-case letters (converted to lower-case)
  172. domain = domain.replace(/%41/ig, "a").replace(/%42/ig, "b").replace(/%43/ig, "c").replace(/%44/ig, "d").replace(/%45/ig, "e");
  173. domain = domain.replace(/%46/ig, "f").replace(/%47/ig, "g").replace(/%48/ig, "h").replace(/%49/ig, "i").replace(/%4A/ig, "j");
  174. domain = domain.replace(/%4B/ig, "k").replace(/%4C/ig, "l").replace(/%4D/ig, "m").replace(/%4E/ig, "n").replace(/%4F/ig, "o");
  175. domain = domain.replace(/%50/ig, "p").replace(/%51/ig, "q").replace(/%52/ig, "r").replace(/%53/ig, "s").replace(/%54/ig, "t");
  176. domain = domain.replace(/%55/ig, "u").replace(/%56/ig, "v").replace(/%57/ig, "w").replace(/%58/ig, "x").replace(/%59/ig, "y");
  177. domain = domain.replace(/%5A/ig, "z");
  178. //lower-case letters
  179. domain = domain.replace(/%61/ig, "a").replace(/%62/ig, "b").replace(/%63/ig, "c").replace(/%64/ig, "d").replace(/%65/ig, "e");
  180. domain = domain.replace(/%66/ig, "f").replace(/%67/ig, "g").replace(/%68/ig, "h").replace(/%69/ig, "i").replace(/%6A/ig, "j");
  181. domain = domain.replace(/%6B/ig, "k").replace(/%6C/ig, "l").replace(/%6D/ig, "m").replace(/%6E/ig, "n").replace(/%6F/ig, "o");
  182. domain = domain.replace(/%70/ig, "p").replace(/%71/ig, "q").replace(/%72/ig, "r").replace(/%73/ig, "s").replace(/%74/ig, "t");
  183. domain = domain.replace(/%75/ig, "u").replace(/%76/ig, "v").replace(/%77/ig, "w").replace(/%78/ig, "x").replace(/%79/ig, "y");
  184. domain = domain.replace(/%7A/ig, "z");
  185. //allowed symbols
  186. domain = domain.replace(/%2D/ig, "-").replace(/%2E/ig, ".");
  187. domain = domain.replace(/%3A/ig, ":").replace(/%5B/ig, "[").replace(/%5D/ig, "]"); //for IPv6 addresses
  188. if((/[^a-z0-9:\[\].-]/i).test(domain)) return ""; //contains invalid characters
  189.  
  190. var ip;
  191. if(ip = normalizeIPv4(domain)) return ip; //it's a valid IPv4 address
  192. if(ip = normalizeIPv6(domain)) return ip; //it's a valid IPv6 address
  193.  
  194. //it's not an IP address
  195. if((/[:\[\]]/).test(domain)) return ""; //contains invalid characters
  196. if(domain.length > 255) return ""; //too long
  197. //note: the spec doesn't allow a name to start with a digit, but this is not enforced
  198. if((/^[a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?(\.[a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?)+$/i).test(domain))
  199. return domain; //valid domain
  200. return ""; //invalid domain
  201. }
  202.  
  203. function normalizeIPv4(ip)
  204. {
  205. if(!(/^(\d+|0x[0-9A-F]+)(\.(\d+|0x[0-9A-F]+)){3}$/i).test(ip)) return ''; //invalid
  206. var parts = ip.split(".");
  207. var val, dwordToIp;
  208. var vals = [];
  209. for(var i=0; i<parts.length; i++) //for each part
  210. {
  211. val = parseInt(parts[i]); //convert hex or octal to dword/decimal
  212.  
  213. //if this is the last part and it's a dword
  214. //e.g., in an IP of 1192362298 or 71.1179962 or 71.18.314
  215. if(i == parts.length-1 && i < 3)
  216. {
  217. //convert dword to decimal parts
  218. //e.g., 1179962 becomes 18.1.58
  219. dwordToIp = [];
  220. while(i < 4)
  221. {
  222. dwordToIp.unshift(val % 256);
  223. val = (val-dwordToIp[0]) / 256;
  224. i++;
  225. }
  226. vals = vals.concat(dwordToIp);
  227. break;
  228. }
  229. val = val % 256;
  230. vals.push(val);
  231. }
  232. return vals.join("."); //valid IP address
  233. }
  234.  
  235. //note: this includes the '[' and ']' characters on the ends of the IP (for use in a URL)
  236. function normalizeIPv6(ip)
  237. {
  238. if(ip.charAt(0) == '[' && ip.charAt(ip.length-1) == ']') ip = ip.slice(1,ip.length-1);
  239. ip = ip.split('::'); //split the IP at the '::' shortcut (if it's used)
  240. if(ip.length < 1 || ip.length > 2) return ''; //invalid IP
  241. var x = ip[0].split(':');
  242. if(x.length > 8 || (ip.length>1 && x.length+ip[1].split(':').length > 7)) return ''; //invalid IP
  243. var a = [], b = [];
  244. for(var i=0; i<x.length; i++) //for each part left of '::' (or of the entire IP if '::' isn't used)
  245. {
  246. if(x[0] == '') break; //there isn't anything on the left side
  247. if((/^[0-9A-F]{1,4}$/i).test(x[i])) a.push(normalizeIPv6.pad(x[i]));
  248. else if(ip.length==1 && i == x.length-1 && (x[i] = normalizeIPv6.v4to6(x[i])) != '') //last part of entire IP is a ver. 4 IP
  249. {
  250. //converted x[i] to a ver. 6 IP
  251. a.push(x[i].substr(0,4));
  252. a.push(x[i].substr(4,4));
  253. }
  254. else return ''; //invalid IP
  255. }
  256. if(ip.length>1) //if the shortcut was used
  257. {
  258. x = ip[1].split(':');
  259. for(i=0; i<x.length; i++) //for each part right of '::'
  260. {
  261. if(x[0] == '') break; //there isn't anything on the right side
  262. if((/^[0-9A-F]{1,4}$/i).test(x[i])) b.push(normalizeIPv6.pad(x[i]));
  263. else if(i == x.length-1 && (x[i] = normalizeIPv6.v4to6(x[i])) != '') //last part of entire IP is a ver. 4 IP
  264. {
  265. //converted x[i] to a ver. 6 IP
  266. b.push(x[i].substr(0,4));
  267. b.push(x[i].substr(5,4));
  268. }
  269. else return ''; //invalid IP
  270. }
  271. while(a.length+b.length < 8) //replace the shortcut with the zeroes it represents
  272. {
  273. a.push('0000');
  274. }
  275. }
  276. return '['+a.concat(b).join(':')+']';
  277. }
  278. normalizeIPv6.pad = function(x)
  279. {
  280. x = ''+x;
  281. while(x.length < 4){ x = '0'+x; }
  282. return x.toLowerCase();
  283. }
  284. normalizeIPv6.v4to6 = function(ip)
  285. {
  286. if(!normalizeIPv4(ip)) return ''; //invalid IP
  287. ip = ip.split('.');
  288. var h = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'];
  289. return '' + h[Math.floor(ip[0]/16)] + h[ip[0]%16] + h[Math.floor(ip[1]/16)] + h[ip[1]%16] +
  290. ':' + h[Math.floor(ip[2]/16)] + h[ip[2]%16] + h[Math.floor(ip[3]/16)] + h[ip[3]%16];
  291. }
  292.  
  293. //converts an obscured URL path to a more readable one
  294. function normalizeURLPath(path)
  295. {
  296. if(!path) return "";
  297.  
  298. path = path.replace(/%3(\d)/g, "$1"); //decimals
  299. path = path.replace(/%41/ig, "A").replace(/%42/ig, "B").replace(/%43/ig, "C").replace(/%44/ig, "D").replace(/%45/ig, "E");
  300. path = path.replace(/%46/ig, "F").replace(/%47/ig, "G").replace(/%48/ig, "H").replace(/%49/ig, "I").replace(/%4A/ig, "J");
  301. path = path.replace(/%4B/ig, "K").replace(/%4C/ig, "L").replace(/%4D/ig, "M").replace(/%4E/ig, "N").replace(/%4F/ig, "O");
  302. path = path.replace(/%50/ig, "P").replace(/%51/ig, "Q").replace(/%52/ig, "R").replace(/%53/ig, "S").replace(/%54/ig, "T");
  303. path = path.replace(/%55/ig, "U").replace(/%56/ig, "V").replace(/%57/ig, "W").replace(/%58/ig, "X").replace(/%59/ig, "Y");
  304. path = path.replace(/%5A/ig, "Z");
  305. path = path.replace(/%61/ig, "a").replace(/%62/ig, "b").replace(/%63/ig, "c").replace(/%64/ig, "d").replace(/%65/ig, "e");
  306. path = path.replace(/%66/ig, "f").replace(/%67/ig, "g").replace(/%68/ig, "h").replace(/%69/ig, "i").replace(/%6A/ig, "j");
  307. path = path.replace(/%6B/ig, "k").replace(/%6C/ig, "l").replace(/%6D/ig, "m").replace(/%6E/ig, "n").replace(/%6F/ig, "o");
  308. path = path.replace(/%70/ig, "p").replace(/%71/ig, "q").replace(/%72/ig, "r").replace(/%73/ig, "s").replace(/%74/ig, "t");
  309. path = path.replace(/%75/ig, "u").replace(/%76/ig, "v").replace(/%77/ig, "w").replace(/%78/ig, "x").replace(/%79/ig, "y");
  310. path = path.replace(/%7A/ig, "z");
  311. path = path.replace(/%2D/ig, "-").replace(/%2E/ig, ".").replace(/%5F/ig, "_").replace(/%7E/ig, "~").replace(/%21/ig, "!");
  312. path = path.replace(/%24/ig, "$").replace(/%27/ig, "'").replace(/%28/ig, "(").replace(/%29/ig, ")").replace(/%2A/ig, "*");
  313. path = path.replace(/%2B/ig, "+").replace(/%2C/ig, ",").replace(/%3B/ig, ";").replace(/%40/ig, "@");
  314. //path = path.replace(/%20/g, " "); //more readable, but not valid
  315.  
  316. return path;
  317. }
  318.  
  319. //****************************************************************
  320. //************************** Mailto ******************************
  321. //****************************************************************
  322.  
  323. //splits a mailto scheme URI into its parts
  324. //returns null if str is not a valid mailto URI or there is no destination
  325. //only includes valid email addresses; the rest are removed
  326. //does not support IPv6 or IPvFuture domains
  327. //see RFC 2368 http://tools.ietf.org/html/rfc2368
  328. function parseMailto(str)
  329. {
  330. var uri = parseURI(str);
  331. if(!uri || uri.scheme != "mailto" || uri.authority) return null;
  332. //note: if there is a fragment, it will simply be left out
  333.  
  334. uri.uri = uri.uri.replace(/%20/g, " ");
  335. uri.path = uri.path.replace(/%20/g, " ");
  336. uri.query = uri.query.replace(/%20/g, " ");
  337.  
  338. var parts = {
  339. uri: "",
  340. scheme: "mailto",
  341. to: [],
  342. cc: [],
  343. bcc: [],
  344. subject: "",
  345. body: "",
  346. headers: [] //other headers besides the above
  347. };
  348. var to1 = [], to2 = [];
  349.  
  350. if(uri.path)
  351. {
  352. to1 = to1.concat(splitEmailAddresses(uri.path));
  353. }
  354. var headers = parseQueryNumeric(uri.query);
  355. for(var i=0; i<headers.length; i++)
  356. {
  357. if(headers[i].value == "") continue;
  358. if(headers[i].name == "to")
  359. {
  360. to2 = to2.concat(splitEmailAddresses(headers[i].value));
  361. }
  362. else if(headers[i].name == "cc")
  363. {
  364. parts.cc = parts.cc.concat(splitEmailAddresses(headers[i].value));
  365. }
  366. else if(headers[i].name == "bcc")
  367. {
  368. parts.bcc = parts.bcc.concat(splitEmailAddresses(headers[i].value));
  369. }
  370. else if(headers[i].name == "subject")
  371. {
  372. parts.subject = headers[i].value;
  373. }
  374. else if(headers[i].name == "body")
  375. {
  376. parts.body = headers[i].value;
  377. }
  378. else
  379. {
  380. parts.headers.push(headers[i]);
  381. }
  382. }
  383.  
  384. parts.to = to1.concat(to2);
  385. if(parts.to.length == 0 && parts.cc.length == 0 && parts.bcc.length == 0) return null; //no destination
  386.  
  387. parts.uri = "mailto:";
  388. if(to1.length > 0)
  389. {
  390. for(i=0; i<to1.length; i++)
  391. {
  392. parts.uri += to1[i];
  393. if(i < to1.length-1) parts.uri += ",";
  394. }
  395. }
  396.  
  397. var qs = [];
  398. var q = -1;
  399. if(to2.length > 0)
  400. {
  401. qs[++q] = "to=";
  402. for(i=0; i<to2.length; i++)
  403. {
  404. qs[q] += to2[i];
  405. if(i < to2.length-1) qs[q] += ",";
  406. }
  407. }
  408. if(parts.cc.length > 0)
  409. {
  410. qs[++q] = "cc=";
  411. for(i=0; i<parts.cc.length; i++)
  412. {
  413. qs[q] += parts.cc[i];
  414. if(i < parts.cc.length-1) qs[q] += ",";
  415. }
  416. }
  417. if(parts.bcc.length > 0)
  418. {
  419. qs[++q] = "bcc=";
  420. for(i=0; i<parts.bcc.length; i++)
  421. {
  422. qs[q] += parts.bcc[i];
  423. if(i < parts.bcc.length-1) qs[q] += ",";
  424. }
  425. }
  426. if(parts.subject) qs[++q] = "subject="+parts.subject;
  427. if(parts.body) qs[++q] = "body="+parts.body;
  428. for(i=0; i<parts.headers.length; i++)
  429. {
  430. qs[++q] = parts.headers[i].name+"="+parts.headers[i].value;
  431. }
  432. if(qs.length > 0) parts.uri += "?"+qs.join("&");
  433.  
  434. return parts;
  435. }
  436. //helper function for parseMailto
  437. //splits the string at the commas, but ignores commas within quoted strings
  438. //only returns valid email addresses
  439. function splitEmailAddresses(str)
  440. {
  441. var addresses = [];
  442. var a = 0, c, m;
  443. var parts = str.split("\""); //split the string at the quotes
  444. str = "";
  445. var inQuote = false;
  446. for(var i=0; i<parts.length; i++)
  447. {
  448. if(inQuote) //currently inside a pair of quotes
  449. {
  450. str += "\"";
  451. if((/(^|[^\\])(\\\\)*\\$/).test(parts[i])) //part ends with the escape character (\)
  452. {
  453. str += parts[i];
  454. }
  455. else //end quote
  456. {
  457. str += parts[i];
  458. if(i < parts.length-1)
  459. {
  460. str += "\"";
  461. inQuote = false;
  462. }
  463. }
  464. }
  465. else //not inside a pair of quotes
  466. {
  467. //if((c=parts[i].search(/,|%2C/i)) > -1) //comma is found
  468. if((c=parts[i].search(/,/i)) > -1) //comma is found
  469. {
  470. addresses[a++] = str + parts[i].slice(0, c); //add the address that ends at the comma
  471. //m = parts[i].match(/(,|%2C)(\s|%20)*/i)[0].length;
  472. //str = parts[i].slice(c+m);
  473. str = parts[i].slice(c+1);
  474. }
  475. else str += parts[i];
  476. if(i < parts.length-1) inQuote = true; //if there are more parts
  477. else addresses[a] = str;
  478. }
  479. }
  480. if(inQuote) return []; //no closing quote
  481. //verify the email addresses
  482. for(i=0; i<addresses.length; i++)
  483. {
  484. addresses[i] = normalizeEmailAddress(addresses[i]);
  485. if(!addresses[i]) addresses.splice(i--,1); //if it's not valid, remove it
  486. }
  487. return addresses;
  488. }
  489.  
  490. //converts an obscured email address to a more readable one; unfolds and removes comments
  491. //returns "" if it's not a valid address
  492. //does not support IPv6 or IPvFuture domains
  493. //see RFC 2822 http://tools.ietf.org/html/rfc2822
  494. // and http://www.ilovejackdaniels.com/php/email-address-validation/
  495. //obsolete forms are not supported
  496. function normalizeEmailAddress(str)
  497. {
  498. if(!str) return "";
  499.  
  500. //remove comments
  501. //regular expressions do not support nesting, so I have to do this manually
  502. var c = 0; //nesting level of comments
  503. var s = ""; //new string
  504. var p, m, char; //position, match, end character
  505. var inQS = false; //inside a quoted string
  506. p = str.search(/(^|[^\\]+?)(\\\\)*[()"]/);
  507. while(p >= 0)
  508. {
  509. m = str.match(/(^|[^\\]+?)(\\\\)*[()"]/)[0];
  510. char = str.charAt(p+m.length-1);
  511. if(char == "\"")
  512. {
  513. if(c == 0) //beginning or end of a quoted string (not inside of a comment)
  514. {
  515. s += str.slice(0, p+m.length);
  516. inQS = !inQS;
  517. }
  518. str = str.slice(p+m.length);
  519. }
  520. else if(char == "(")
  521. {
  522. if(inQS) s += str.slice(0, p+m.length); //inside a quoted string
  523. else if(c++ == 0) s += str.slice(0, p+m.length-1); //beginning of a top-level comment
  524. str = str.slice(p+m.length);
  525. }
  526. else if(char == ")")
  527. {
  528. if(inQS) s += str.slice(0, p+m.length); //inside a quoted string
  529. else c--; //end of a comment
  530. str = str.slice(p+m.length);
  531. }
  532. if(c < 0) return ""; //invalid comment nesting
  533. p = str.search(/(^|[^\\]+)(\\\\)*[()"]/);
  534. }
  535. str = s + str;
  536.  
  537. str = str.replace(/\s+/g, " "); //replace whitespace with a single space
  538. str = str.replace(/[\\x01-\\x1F\\x7F]+/g, ""); //remove remaining (non-whitespace) control characters
  539.  
  540. var atext = "[!#$%&'*+`/0-9=?A-Z^_a-z{|}~-]";
  541. var qtext = "[!#$%&'()*+`./0-9:;<=>?@A-Z\\[\\]^_,a-z{|}~-]";
  542. var qptext = "("+qtext+"|[\"\\\\])";
  543. //var dtext = "[!\"#$%&'()*+`./0-9:;<=>?@A-Z^_,a-z{|}~-]"; //for IPv6 or IPvFuture formatted domains
  544.  
  545. var dotAtom = "( ?"+atext+"+(\\."+atext+"+)* ?)";
  546. var quotedString = "( ?\"( ?("+qtext+"|\\\\"+qptext+"))* ?\" )";
  547. //var domainLiteral = "( ?\\[( ?("+dtext+"|\\\\"+qptext+"))* ?\\] ?)"; //for IPv6 or IPvFuture formatted domains
  548.  
  549. var localPart = "("+dotAtom+"|"+quotedString+")";
  550. //var domain = "("+dotAtom+"|"+domainLiteral+")"; //we won't support IPv6 or IPvFuture formatted domains
  551. var domain = dotAtom;
  552. var addrSpec = "("+localPart+"@"+domain+")";
  553.  
  554. var displayName = "(( ?"+atext+"+ ?|"+quotedString+")+)";
  555. var nameAddr = "("+displayName+"? ?<"+addrSpec+"> ?)";
  556.  
  557. var mailbox = "^("+nameAddr+"|"+addrSpec+")$";
  558.  
  559. rxp = new RegExp(mailbox);
  560. if(rxp.test(str)) //valid mailbox so far
  561. {
  562. //get the domain
  563. rxp = new RegExp("@("+domain+")(> ?)?$");
  564. var d = str.match(rxp)[0];
  565. var dn = d.replace(rxp, "$2");
  566. d = d.replace(rxp, "$1");
  567. d = d.replace(/^ +| +$/g, ""); //remove spaces from ends
  568.  
  569. //normalize the domain
  570. var normalizedDomain = normalizeURLDomain(d);
  571. if(!normalizedDomain) return ""; //invalid domain
  572.  
  573. //replace the domain with the normalized version
  574. str = str.replace(rxp, "@"+normalizedDomain+(dn?">":""));
  575.  
  576. //get the local part
  577. rxp = new RegExp("("+localPart+")@"+normalizedDomain+">?");
  578. var lp = str.match(rxp)[0].replace(rxp, "$1");
  579. lp = lp.replace(/^ +| +$/g, ""); //remove spaces from ends
  580.  
  581. //replace local part with cleaned-up version
  582. str = str.replace(rxp, lp+"@"+normalizedDomain+(dn?">":""));
  583.  
  584. if(dn)
  585. {
  586. //get the display name, if there is one
  587. rxp = new RegExp("^"+displayName);
  588. var dn = str.match(rxp);
  589. if(dn)
  590. {
  591. dn = dn[0].replace(/^ +| +$/g, ""); //remove spaces from ends of display name
  592. str = str.replace(rxp, dn); //replace display name with cleaned-up version
  593. }
  594. }
  595.  
  596. return str; //valid mailbox
  597. }
  598. return ""; //invalid mailbox
  599. }
  600.  
  601. //****************************************************************
  602. //*************************** Fixes ******************************
  603. //****************************************************************
  604.  
  605. //attempts to fix a URL if needed
  606. //domain: domain to use if the url is relative
  607. //returns null if it can't be fixed
  608. function fixURL(str, domain)
  609. {
  610. str = str.replace(/ /g, "%20"); //make sure all spaces are escaped
  611. var url = parseURL(str);
  612. if(url) return url; //valid URL
  613.  
  614. domain = normalizeURLDomain(domain);
  615. str = str.replace(/"/g, "%22");
  616. str = str.replace(/</g, "%3C");
  617. str = str.replace(/>/g, "%3E");
  618. url = parseURI(str);
  619. if(!url && str.charAt(0) == "/") //relative path
  620. {
  621. if(!domain) return null; //invalid URL; can't fix it since no valid domain was given
  622. str = "http://"+domain+str;
  623. url = parseURL(str);
  624. if(url) return url; //it's now a valid URL
  625. url = parseURI(str);
  626. }
  627. if(!url && str.slice(0,7) != "http://" && str.slice(0,8) != "https://")
  628. {
  629. str = "http://"+str;
  630. url = parseURL(str);
  631. if(url) return url; //it's now a valid URL
  632. url = parseURI(str);
  633. }
  634. if(!url) return null; //invalid URI; can't be fixed
  635.  
  636. //valid URI; try to make it a valid URL
  637. str = url.scheme+"://";
  638. str += url.domain || domain;
  639. str += url.port ? ":"+url.port : "";
  640. str += normalizeURLPath(url.path)+(url.query ? "?"+url.query : "")+(url.fragment ? "#"+url.fragment : "");
  641.  
  642. url = parseURL(str);
  643. if(url) return url; //it's now a valid URL
  644. return null; //invalid URL; can't be fixed
  645. }
  646.  
  647. //attempts to fix a hyperlink address (http(s) or mailto) if needed
  648. //domain = domain to use if the url is relative
  649. //returns "" if it can't be fixed
  650. function fixHyperlink(str, domain, allowMailto)
  651. {
  652. domain = domain || "";
  653.  
  654. //get the scheme
  655. var matches = str.match(/^[a-z0-9+.-]+:/i);
  656. var scheme = (matches ? matches[0].slice(0, matches[0].length-1).toLowerCase() : "");
  657. if(scheme != "http" && scheme != "https" && (allowMailto ? scheme!="mailto" : true)) scheme = "";
  658.  
  659. if(!scheme || scheme == "http" || scheme == "https") //URL or unknown scheme (assume unknown is meant to be a URL)
  660. {
  661. var lnk = fixURL(str, domain);
  662. if(lnk) return lnk.url;
  663. }
  664. else if(allowMailto) //mailto address
  665. {
  666. var lnk = parseMailto(str);
  667. if(lnk) return lnk.uri;
  668. }
  669. return ""; //can't be fixed
  670. }

Report this snippet  

Comments

RSS Icon Subscribe to comments
Posted By: wizard04 on December 30, 2008

Fixed an issue with slashes on line 12. Don't know how I missed that before...

Posted By: wizard04 on December 30, 2008

Updated to allow IPv6 addresses.

Posted By: wizard04 on April 16, 2014

The latest: https://github.com/wizard04wsu/URI_Parsing

You need to login to post a comment.