Posted By

tclancy on 02/09/10


Tagged

html Net cleaner


Versions (?)

Who likes this?

1 person has marked this snippet as a favorite

mamona


.NET HTML Formatter


 / Published in: C#
 

Requires TidyATL library for .NET - http://www.devx.com/dotnet/Article/20505/1763/page/2

  using System;
  using System.Collections.Generic;
  using System.Text;
  using System.Text.RegularExpressions;
  using System.Collections;

  using TidyATL;

  using ProjectName.Core.Interfaces;

  namespace ProjectName.Core.Content
  {
      /// <summary>
      /// Cleans an HTML fragment down to a whitelist of tags and swaps the
      /// configured site root URL ("URL_ROOT" app setting) for a placeholder,
      /// which <see cref="Render"/> expands again. Uses the TidyATL library to
      /// normalise the markup before filtering.
      /// </summary>
      public class HtmlFormatter : IFormatter
      {
          // Tag policies stored in _tagMap. Tags that are not in the map have
          // their markup removed while the text between them is kept.
          private const int StripTagAndContents = -1;
          private const int AllowWithoutAttributes = 0;
          private const int AllowWithAttributes = 1;

          // Lazy match + Singleline so each comment is removed on its own,
          // including comments that span several lines.
          private static readonly Regex CommentPattern =
              new Regex("<!--.*?-->", RegexOptions.Singleline);

          // The attribute name must be preceded by whitespace (so "valid=" is not
          // mistaken for "id=") and the value must be closed by the same quote
          // character that opened it.
          private static readonly Regex NastyAttributePattern = new Regex(
              @"\s+(?:on[a-z]+|style|id)\s*=\s*(?:""[^""]*""|'[^']*')",
              RegexOptions.IgnoreCase);

          private readonly string _contents;
          private readonly Dictionary<string, int> _tagMap = new Dictionary<string, int>();
          private readonly string _urlPlaceholder = "[** URL_ROOT **]";

          /// <summary>
          /// Creates a formatter for the given markup.
          /// </summary>
          /// <param name="content">HTML to clean or render; null is treated as empty.</param>
          public HtmlFormatter(string content)
          {
              this._contents = content ?? string.Empty;
              this.Setup();
          }

          /// <summary>
          /// Fills the tag lookup table (key = lower-case tag name, value = policy).
          /// </summary>
          private void Setup()
          {
              // NOTE(review): tags allowed with attributes only lose on*/style/id
              // (see ReplaceNastyAttributes); href/src values are not inspected,
              // so e.g. "javascript:" URLs pass through. Confirm this is acceptable.

              // removed together with everything inside them
              this._tagMap.Add("head", StripTagAndContents);
              this._tagMap.Add("select", StripTagAndContents);
              this._tagMap.Add("input", StripTagAndContents);
              this._tagMap.Add("script", StripTagAndContents);
              this._tagMap.Add("noscript", StripTagAndContents);
              this._tagMap.Add("xmp", StripTagAndContents);
              this._tagMap.Add("style", StripTagAndContents);

              // kept, attributes preserved
              this._tagMap.Add("a", AllowWithAttributes);
              this._tagMap.Add("table", AllowWithAttributes);
              this._tagMap.Add("tr", AllowWithAttributes);
              this._tagMap.Add("th", AllowWithAttributes);
              this._tagMap.Add("td", AllowWithAttributes);
              this._tagMap.Add("p", AllowWithAttributes);
              this._tagMap.Add("xml", AllowWithAttributes);
              this._tagMap.Add("img", AllowWithAttributes);
              this._tagMap.Add("iframe", AllowWithAttributes);

              // kept, attributes dropped
              this._tagMap.Add("ul", AllowWithoutAttributes);
              this._tagMap.Add("ol", AllowWithoutAttributes);
              this._tagMap.Add("li", AllowWithoutAttributes);
              this._tagMap.Add("br", AllowWithoutAttributes);
              this._tagMap.Add("hr", AllowWithoutAttributes);
              this._tagMap.Add("b", AllowWithoutAttributes);
              this._tagMap.Add("strong", AllowWithoutAttributes);
              this._tagMap.Add("i", AllowWithoutAttributes);
              this._tagMap.Add("u", AllowWithoutAttributes);
              this._tagMap.Add("strike", AllowWithoutAttributes);
              this._tagMap.Add("sup", AllowWithoutAttributes);
              this._tagMap.Add("sub", AllowWithoutAttributes);
          }

          #region IFormatter Members

          /// <summary>
          /// Returns the stored content with every URL placeholder replaced by the
          /// configured "URL_ROOT" value.
          /// </summary>
          /// <returns>The content ready for output.</returns>
          public string Render()
          {
              // fix links
              // The original body used "formatMe" without declaring it; the stored
              // content is the only input this class holds, so it is used here.
              string formatMe = this.ReplaceRelativeUrlPlaceholder(this._contents);

              // find assets (not implemented)

              return formatMe;
          }

          /// <summary>
          /// Tidies the stored content, strips disallowed markup and substitutes
          /// the URL placeholder for the configured root URL.
          /// </summary>
          /// <returns>The cleaned markup.</returns>
          /// <exception cref="InvalidOperationException">Tidy could not parse or repair the content.</exception>
          public string Clean()
          {
              return this.CleanTags(this._contents);
          }

          #endregion

          #region HTML Markup Handling

          /// <summary>
          /// Replaces every literal occurrence of the "URL_ROOT" app setting in
          /// <paramref name="input"/> with the URL placeholder.
          /// </summary>
          /// <param name="input">Markup to scan.</param>
          /// <returns>The markup with root URLs replaced; unchanged when the setting is missing or empty.</returns>
          private string InsertRelativeUrlPlaceholder(string input)
          {
              string urlRoot = System.Configuration.ConfigurationManager.AppSettings["URL_ROOT"];

              // string.Replace rejects an empty search value, and a URL must be
              // matched literally rather than as a regular expression ('.' and '?'
              // are regex metacharacters).
              if (string.IsNullOrEmpty(input) || string.IsNullOrEmpty(urlRoot))
              {
                  return input;
              }

              return input.Replace(urlRoot, this._urlPlaceholder);
          }

          /// <summary>
          /// Replaces every URL placeholder in <paramref name="input"/> with the
          /// "URL_ROOT" app setting. When the setting is missing the placeholder
          /// is simply removed.
          /// </summary>
          /// <param name="input">Markup containing placeholders.</param>
          /// <returns>The markup with placeholders expanded.</returns>
          private string ReplaceRelativeUrlPlaceholder(string input)
          {
              if (string.IsNullOrEmpty(input))
              {
                  return input;
              }

              string urlRoot = System.Configuration.ConfigurationManager.AppSettings["URL_ROOT"];
              return input.Replace(this._urlPlaceholder, urlRoot);
          }

          /// <summary>
          /// Runs the markup through Tidy (body only, XHTML output, Word 2000
          /// clean-up, automatic indentation) and returns the trimmed result.
          /// </summary>
          /// <param name="input">Markup to tidy.</param>
          /// <returns>The repaired markup.</returns>
          /// <exception cref="InvalidOperationException">Tidy reported a negative status while parsing or repairing.</exception>
          private string TidyHTML(string input)
          {
              Tidy.Document doc = new Tidy.Document();

              // For debugging, diagnostics can be routed to TidyDiagnostics:
              //doc.OnMessage += new Tidy.IDocumentEvents_OnMessageEventHandler(TidyDiagnostics);

              // set some options
              doc.SetOptBool(TidyOptionId.TidyBodyOnly, 1);
              doc.SetOptBool(TidyOptionId.TidyXhtmlOut, 1);
              doc.SetOptBool(TidyOptionId.TidyWord2000, 1);
              doc.SetOptValue(TidyOptionId.TidyIndentContent, "auto");

              int err_code = doc.ParseString(input);
              if (err_code < 0)
              {
                  throw new InvalidOperationException("Unable to parse string: " + input);
              }

              err_code = doc.CleanAndRepair();
              if (err_code < 0)
              {
                  throw new InvalidOperationException("Unable to clean/repair string: " + input);
              }

              return doc.SaveString().Trim();
          }

          /// <summary>
          /// Tidy message callback; writes the diagnostic text to the console.
          /// </summary>
          public void TidyDiagnostics(TidyATL.TidyReportLevel level, int line, int col, string message)
          {
              Console.WriteLine("Tidy diagnostic message: " + message);
          }

          /// <summary>
          /// Finds the '&gt;' that closes the tag starting at <paramref name="tagStart"/>,
          /// ignoring any '&gt;' that sits inside a quoted attribute value.
          /// </summary>
          /// <param name="input">Markup being scanned.</param>
          /// <param name="tagStart">Index of the tag's opening '&lt;'.</param>
          /// <returns>Index of the closing '&gt;', or -1 when the tag is never closed.</returns>
          private int InStrEndOfTag(string input, int tagStart)
          {
              char openQuote = '\0';

              for (int pos = tagStart + 1; pos < input.Length; pos++)
              {
                  char current = input[pos];

                  if (openQuote != '\0')
                  {
                      // inside an attribute value: only the matching quote ends it
                      if (current == openQuote)
                      {
                          openQuote = '\0';
                      }
                  }
                  else if (current == '"' || current == '\'')
                  {
                      openQuote = current;
                  }
                  else if (current == '>')
                  {
                      return pos;
                  }
              }

              return -1;
          }

          /// <summary>
          /// Extracts the lower-case element name from the text between '&lt;' and
          /// '&gt;': a leading '/' is skipped and the name ends at the first
          /// whitespace character or '/'.
          /// </summary>
          /// <param name="tag">Tag text without the angle brackets.</param>
          /// <returns>The element name; empty when the tag has none.</returns>
          private static string GetTagName(string tag)
          {
              int start = (tag.Length > 0 && tag[0] == '/') ? 1 : 0;
              int end = start;

              while (end < tag.Length && !char.IsWhiteSpace(tag[end]) && tag[end] != '/')
              {
                  end++;
              }

              return tag.Substring(start, end - start).ToLowerInvariant();
          }

          /// <summary>
          /// Applies the tag policies in the lookup table: unknown tags are
          /// removed (their text stays), "strip" tags are removed together with
          /// their contents, and allowed tags are written with or without their
          /// attributes.
          /// </summary>
          /// <param name="input">Markup to filter.</param>
          /// <returns>The filtered markup.</returns>
          private string RemoveExtraTags(string input)
          {
              StringBuilder output = new StringBuilder(input.Length);
              int pos = 0;

              while (pos < input.Length)
              {
                  if (input[pos] != '<')
                  {
                      // copy the whole run of text up to the next tag in one go
                      int nextTag = input.IndexOf('<', pos);
                      if (nextTag < 0)
                      {
                          nextTag = input.Length;
                      }
                      output.Append(input, pos, nextTag - pos);
                      pos = nextTag;
                      continue;
                  }

                  int tagEnd = this.InStrEndOfTag(input, pos);
                  string tag;
                  if (tagEnd < 0)
                  {
                      // unterminated tag: everything after '<' belongs to it
                      tag = input.Substring(pos + 1);
                      pos = input.Length;
                  }
                  else
                  {
                      tag = input.Substring(pos + 1, tagEnd - pos - 1);
                      pos = tagEnd + 1;
                  }

                  bool isClosing = tag.StartsWith("/", StringComparison.Ordinal);
                  bool isSelfClosing = tag.EndsWith("/", StringComparison.Ordinal);
                  string name = GetTagName(tag);

                  int policy;
                  if (!this._tagMap.TryGetValue(name, out policy))
                  {
                      // not whitelisted: drop the tag itself, keep surrounding text
                      continue;
                  }

                  switch (policy)
                  {
                      case StripTagAndContents:
                          // Only an opening tag has contents to skip; a stray closing
                          // or self-closing tag must not swallow unrelated markup.
                          if (!isClosing && !isSelfClosing)
                          {
                              string closer = "</" + name + ">";
                              int closeAt = input.IndexOf(closer, pos, StringComparison.OrdinalIgnoreCase);
                              if (closeAt >= 0)
                              {
                                  pos = closeAt + closer.Length;
                              }
                          }
                          break;

                      case AllowWithoutAttributes:
                          output.Append('<');
                          if (isClosing)
                          {
                              output.Append('/');
                          }
                          output.Append(name);
                          if (isSelfClosing && !isClosing)
                          {
                              // keep the output well-formed XHTML (e.g. <br />)
                              output.Append(" /");
                          }
                          output.Append('>');
                          break;

                      case AllowWithAttributes:
                          output.Append('<').Append(tag).Append('>');
                          break;

                      default:
                          break;
                  }
              }

              return output.ToString();
          }

          /// <summary>
          /// Full cleaning pipeline: Tidy, comment removal, attribute removal,
          /// tag whitelist, then URL placeholder substitution.
          /// </summary>
          /// <param name="input">Raw markup.</param>
          /// <returns>The cleaned markup.</returns>
          private string CleanTags(string input)
          {
              // run HTML Tidy on content
              string formatMe = this.TidyHTML(input.Trim());

              // get rid of comments first to make tag balancing a little easier
              formatMe = this.StripComments(formatMe);

              // remove attributes that are unacceptable in any case (e.g., JavaScript attributes, CSS)
              formatMe = this.ReplaceNastyAttributes(formatMe);

              formatMe = this.RemoveExtraTags(formatMe);

              // substitute placeholder for relative links
              formatMe = this.InsertRelativeUrlPlaceholder(formatMe);

              return formatMe;
          }

          /// <summary>
          /// Removes every HTML comment, including multi-line ones.
          /// </summary>
          /// <param name="input">Markup to process.</param>
          /// <returns>The markup without comments.</returns>
          private string StripComments(string input)
          {
              return CommentPattern.Replace(input, "");
          }

          /// <summary>
          /// Removes quoted on* event-handler, style and id attributes (any letter
          /// case) together with the whitespace in front of them.
          /// </summary>
          /// <param name="input">Markup to process.</param>
          /// <returns>The markup without those attributes.</returns>
          private string ReplaceNastyAttributes(string input)
          {
              return NastyAttributePattern.Replace(input, "");
          }

          #endregion HTML Markup Handling

      }
  }

Report this snippet  

You need to log in to post a comment.