Return to Snippet

Revision: 23571
at February 9, 2010 12:35 by tclancy


Updated Code
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;

using TidyATL;

using ProjectName.Core.Interfaces;

namespace ProjectName.Core.Content
{
    public class HtmlFormatter : IFormatter
    {
        // Raw HTML handed to the constructor.
        private readonly string _contents;

        // Tag policy table filled in by Setup(): tag name -> -1, 0 or 1.
        private readonly Hashtable _tagMap = new Hashtable();

        // Not referenced by any method of this class.
        private readonly ArrayList _singleTags = new ArrayList();

        // Token that stands in for the "URL_ROOT" app setting inside cleaned content.
        private readonly string _urlPlaceholder = "[** URL_ROOT **]";

        /// <summary>
        /// Creates a formatter for the given HTML and builds the tag policy table.
        /// </summary>
        /// <param name="content">HTML to work on; stored as-is, without validation.</param>
        public HtmlFormatter(string content)
        {
            this._contents = content;
            this.Setup();
        }

        /// <summary>
        /// Populates the tag policy table. Each key is a tag name and each value is its policy:
        /// -1 = strip the tag and everything inside it,
        ///  0 = allow the tag but without attributes,
        ///  1 = allow the tag together with its attributes.
        /// Tags that are not in the table should be stripped.
        /// </summary>
        private void Setup()
        {
            string[] removedWithContents = { "head", "select", "input", "script", "noscript", "xmp", "style" };
            string[] allowedWithAttributes = { "a", "table", "tr", "th", "td", "p", "xml", "img", "iframe" };
            string[] allowedBare = { "ul", "ol", "li", "br", "hr", "b", "strong", "i", "u", "strike", "sup", "sub" };

            foreach (string tagName in removedWithContents)
            {
                this._tagMap.Add(tagName, -1);
            }

            foreach (string tagName in allowedWithAttributes)
            {
                this._tagMap.Add(tagName, 1);
            }

            foreach (string tagName in allowedBare)
            {
                this._tagMap.Add(tagName, 0);
            }
        }

        #region IFormatter Members

        /// <summary>
        /// Returns the stored content with every URL placeholder replaced by the
        /// configured "URL_ROOT" value.
        /// </summary>
        /// <returns>The content with placeholders expanded.</returns>
        public string Render()
        {
            // The working copy must be declared here; no other member defines formatMe.
            string formatMe = this._contents;

            // fix links
            formatMe = this.ReplaceRelativeUrlPlaceholder(formatMe);

            // find assets (no code for this step yet)

            return formatMe;
        }

        /// <summary>
        /// Runs the stored content through CleanTags and returns the result.
        /// </summary>
        /// <returns>The cleaned markup produced by CleanTags.</returns>
        public string Clean()
        {
            return this.CleanTags(this._contents);
        }
        #endregion

        #region HTML Markup Handling

        /// <summary>
        /// Replaces every literal occurrence of the configured "URL_ROOT" app setting
        /// with the URL placeholder token.
        /// </summary>
        /// <param name="input">Markup to process.</param>
        /// <returns>
        /// The markup with the site root replaced; <paramref name="input"/> unchanged when it
        /// is null/empty or when the "URL_ROOT" setting is missing or empty.
        /// </returns>
        private string InsertRelativeUrlPlaceholder(string input)
        {
            string urlRoot = System.Configuration.ConfigurationManager.AppSettings["URL_ROOT"];

            // A missing or empty setting would make IndexOf/Replace throw, so there is nothing to do.
            if (string.IsNullOrEmpty(input) || string.IsNullOrEmpty(urlRoot))
            {
                return input;
            }

            // Plain string replacement: the setting is a URL, not a regular expression, and
            // characters such as '.' or '?' must only match themselves.
            return input.Replace(urlRoot, this._urlPlaceholder);
        }

        /// <summary>
        /// Swaps every URL placeholder token in the markup for the "URL_ROOT" app setting.
        /// </summary>
        /// <param name="input">Markup that may contain the placeholder token.</param>
        /// <returns>The markup with placeholders replaced.</returns>
        private string ReplaceRelativeUrlPlaceholder(string input)
        {
            string urlRoot = System.Configuration.ConfigurationManager.AppSettings["URL_ROOT"];
            return input.Replace(this._urlPlaceholder, urlRoot);
        }

        /// <summary>
        /// Parses the markup with a Tidy document, runs clean-and-repair on it and returns
        /// the saved result with surrounding whitespace trimmed.
        /// </summary>
        /// <param name="input">Markup handed to the Tidy parser.</param>
        /// <returns>The trimmed string produced by the Tidy document.</returns>
        /// <exception cref="Exception">
        /// Thrown when ParseString or CleanAndRepair reports a negative status code.
        /// </exception>
        private string TidyHTML(string input)
        {
            Tidy.Document tidyDocument = new Tidy.Document();

            // The diagnostics hook is currently left detached:
            //tidyDocument.OnMessage += new Tidy.IDocumentEvents_OnMessageEventHandler(TidyDiagnostics);

            // Option meanings are presumed from their names (body only, XHTML output,
            // Word 2000 cleanup, automatic indentation) - confirm against the TidyATL docs.
            tidyDocument.SetOptBool(TidyOptionId.TidyBodyOnly, 1);
            tidyDocument.SetOptBool(TidyOptionId.TidyXhtmlOut, 1);
            tidyDocument.SetOptBool(TidyOptionId.TidyWord2000, 1);
            tidyDocument.SetOptValue(TidyOptionId.TidyIndentContent, "auto");

            int parseStatus = tidyDocument.ParseString(input);
            if (parseStatus < 0)
            {
                throw new Exception("Unable to parse string: " + input);
            }

            int repairStatus = tidyDocument.CleanAndRepair();
            if (repairStatus < 0)
            {
                throw new Exception("Unable to clean/repair string: " + input);
            }

            // RunDiagnostics() is not invoked on the document.

            string tidied = tidyDocument.SaveString();
            return tidied.Trim();
        }

        /// <summary>
        /// Writes a Tidy diagnostic message to the console.
        /// </summary>
        /// <param name="level">Report level; not used.</param>
        /// <param name="line">Line number; not used.</param>
        /// <param name="col">Column number; not used.</param>
        /// <param name="message">Text appended to the console line.</param>
        public void TidyDiagnostics(TidyATL.TidyReportLevel level, int line, int col, string message)
        {
            string report = "Tidy diagnostic message: " + message;
            Console.WriteLine(report);
        }

        /// <summary>
        /// Finds the index of the '&gt;' that closes the tag at the start of the string.
        /// A '&gt;' inside a quoted attribute value does not count as the end of the tag.
        /// </summary>
        /// <param name="input">Text expected to begin with the tag's opening '&lt;'.</param>
        /// <returns>
        /// The zero-based index of the closing '&gt;'; 0 when the input is null, empty or has
        /// no '&gt;' after its first character. If a quote is opened but never closed, the
        /// first '&gt;' seen is returned so one bad quote cannot swallow the rest of the text.
        /// </returns>
        private int InStrEndOfTag (string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return 0;
            }

            char openQuote = '\0';   // quote character we are currently inside, or '\0'
            int firstCloser = 0;     // first '>' seen anywhere, used as a fallback

            // Index 0 is the opening '<', so scanning starts at 1 and never reads past the end.
            for (int pos = 1; pos < input.Length; pos++)
            {
                char current = input[pos];

                if (current == '>' && firstCloser == 0)
                {
                    firstCloser = pos;
                }

                if (openQuote != '\0')
                {
                    if (current == openQuote)
                    {
                        openQuote = '\0';
                    }
                }
                else if (current == '>')
                {
                    return pos;
                }
                else if (current == '"' || current == '\'')
                {
                    openQuote = current;
                }
            }

            // Either no '>' at all (0) or an unterminated quote hid it (fall back to the first one).
            return firstCloser;
        }

        /// <summary>
        /// Filters markup against the tag policy table. Text outside tags is copied through.
        /// </summary>
        /// <remarks>
        /// Policy -1: the tag is dropped and, for an opening tag, so is everything up to and
        /// including its closing tag (when one exists).
        /// Policy 0: the tag is re-emitted as a bare &lt;name&gt; or &lt;/name&gt;, without attributes.
        /// Policy 1: the tag text is re-emitted unchanged.
        /// Tags that are not in the table are dropped; the text between them is kept.
        /// </remarks>
        /// <param name="input">Markup to filter.</param>
        /// <returns>The filtered markup; an empty string for null or empty input.</returns>
        private string RemoveExtraTags(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return "";
            }

            StringBuilder output = new StringBuilder(input.Length);
            int index = 0;

            while (index < input.Length)
            {
                if (input[index] != '<')
                {
                    // Copy a whole run of plain text at once rather than one character at a time.
                    int nextTag = input.IndexOf('<', index);
                    if (nextTag < 0)
                    {
                        nextTag = input.Length;
                    }

                    output.Append(input, index, nextTag - index);
                    index = nextTag;
                    continue;
                }

                string rest = input.Substring(index);
                int end = InStrEndOfTag(rest);
                string tag;

                if (end <= 0 || end >= rest.Length)
                {
                    // No closing '>' was found: everything after the '<' is the tag text.
                    tag = rest.Substring(1);
                    index = input.Length;
                }
                else
                {
                    tag = rest.Substring(1, end - 1);
                    index += end + 1;
                }

                // The name runs from after an optional leading '/' to the first whitespace.
                bool isClosing = tag.Length > 0 && tag[0] == '/';
                int nameStart = isClosing ? 1 : 0;
                int nameEnd = nameStart;
                while (nameEnd < tag.Length && !char.IsWhiteSpace(tag[nameEnd]))
                {
                    nameEnd++;
                }

                string name = tag.Substring(nameStart, nameEnd - nameStart);
                if (nameEnd == tag.Length && name.EndsWith("/", StringComparison.Ordinal))
                {
                    // "<br/>" written without a space: the slash is not part of the name.
                    name = name.Substring(0, name.Length - 1);
                }

                // Invariant casing so the lookup does not depend on the current culture.
                name = name.ToLowerInvariant();

                if (!this._tagMap.Contains(name))
                {
                    continue;
                }

                int policy = Convert.ToInt32(this._tagMap[name]);
                switch (policy)
                {
                    case -1:
                        // Only an opening tag removes content; a stray closing tag is just dropped.
                        if (!isClosing)
                        {
                            int closeAt = input.IndexOf("</" + name + ">", index, StringComparison.OrdinalIgnoreCase);
                            if (closeAt >= 0)
                            {
                                // Skip past "</" + name + ">".
                                index = closeAt + name.Length + 3;
                            }
                        }
                        break;
                    case 0:
                        output.Append('<');
                        if (isClosing)
                        {
                            output.Append('/');
                        }
                        output.Append(name).Append('>');
                        break;
                    case 1:
                        output.Append('<').Append(tag).Append('>');
                        break;
                    default:
                        break;
                }
            }

            return output.ToString();
        }

        /// <summary>
        /// Cleaning pipeline: Tidy, comment removal, attribute removal, tag filtering and
        /// finally URL placeholder insertion, in that order.
        /// </summary>
        /// <param name="input">Markup to clean; trimmed before it is handed to Tidy.</param>
        /// <returns>The cleaned markup.</returns>
        private string CleanTags(string input)
        {
            // HTML Tidy goes first.
            string tidied = this.TidyHTML(input.Trim());

            // Comments are removed early to make tag balancing a little easier.
            string withoutComments = this.StripComments(tidied);

            // Attributes that are never acceptable (e.g., JavaScript attributes, CSS).
            string withoutNastyAttributes = this.ReplaceNastyAttributes(withoutComments);

            string allowedTagsOnly = RemoveExtraTags(withoutNastyAttributes);

            // Placeholder substitution for relative links comes last.
            return this.InsertRelativeUrlPlaceholder(allowedTagsOnly);
        }

        /// <summary>
        /// Removes every HTML comment (&lt;!-- ... --&gt;) from the markup.
        /// </summary>
        /// <param name="input">Markup to process.</param>
        /// <returns>The markup without comments; null or empty input is returned unchanged.</returns>
        private string StripComments(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return input;
            }

            // Lazy ".*?" stops at the first "-->" so text between two comments survives;
            // Singleline lets '.' cross line breaks so multi-line comments are removed too.
            return Regex.Replace(input, "<!--.*?-->", "", RegexOptions.Singleline);
        }

        /// <summary>
        /// Removes quoted on* (event handler), style and id attributes from the markup,
        /// together with the whitespace in front of them.
        /// </summary>
        /// <remarks>
        /// The pattern is applied to the whole string, not only to text inside tags.
        /// Only quoted attribute values are matched.
        /// </remarks>
        /// <param name="input">Markup to process.</param>
        /// <returns>The markup without those attributes; null or empty input is returned unchanged.</returns>
        private string ReplaceNastyAttributes(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return input;
            }

            // - leading \s+ keeps names that merely end in "id"/"style" (e.g. data-id) intact
            // - each quote style is matched with its own closing quote, so "it's" is one value
            // - [^"]* / [^']* also cover values that span several lines
            // - IgnoreCase catches onClick, STYLE, ...
            return Regex.Replace(
                input,
                "\\s+(on[a-z]+|style|id)\\s*=\\s*(\"[^\"]*\"|'[^']*')",
                "",
                RegexOptions.IgnoreCase);
        }

        #endregion HTML Markup Handling

    }
}

Revision: 23570
at February 9, 2010 12:33 by tclancy


Initial Code
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;

using TidyATL;

using ProjectName.Core.Interfaces;

namespace ProjectName.Core.Content
{
    public class HtmlFormatter : IFormatter
    {
        // Raw HTML handed to the constructor.
        private readonly string _contents;

        // Tag policy table filled in by Setup(): tag name -> -1, 0 or 1.
        private readonly Hashtable _tagMap = new Hashtable();

        // Not referenced by any method of this class.
        private readonly ArrayList _singleTags = new ArrayList();

        // Token that stands in for the "URL_ROOT" app setting inside cleaned content.
        private readonly string _urlPlaceholder = "[** URL_ROOT **]";

        /// <summary>
        /// Creates a formatter for the given HTML and builds the tag policy table.
        /// </summary>
        /// <param name="content">HTML to work on; stored as-is, without validation.</param>
        public HtmlFormatter(string content)
        {
            this._contents = content;
            this.Setup();
        }

        /// <summary>
        /// Populates the tag policy table. Each key is a tag name and each value is its policy:
        /// -1 = strip the tag and everything inside it,
        ///  0 = allow the tag but without attributes,
        ///  1 = allow the tag together with its attributes.
        /// Tags that are not in the table should be stripped.
        /// </summary>
        private void Setup()
        {
            string[] removedWithContents = { "head", "select", "input", "script", "noscript", "xmp", "style" };
            string[] allowedWithAttributes = { "a", "table", "tr", "th", "td", "p", "xml", "img", "iframe" };
            string[] allowedBare = { "ul", "ol", "li", "br", "hr", "b", "strong", "i", "u", "strike", "sup", "sub" };

            foreach (string tagName in removedWithContents)
            {
                this._tagMap.Add(tagName, -1);
            }

            foreach (string tagName in allowedWithAttributes)
            {
                this._tagMap.Add(tagName, 1);
            }

            foreach (string tagName in allowedBare)
            {
                this._tagMap.Add(tagName, 0);
            }
        }

        #region IFormatter Members

        /// <summary>
        /// Produces the display form of the stored content: form markup is expanded first,
        /// then URL placeholders are replaced.
        /// </summary>
        /// <returns>The rendered markup.</returns>
        public string Render()
        {
            string withForms = this.ReplaceFormMarkup(this._contents);

            // fix links
            string withLinks = this.ReplaceRelativeUrlPlaceholder(withForms);

            // find assets

            return withLinks;
        }

        /// <summary>
        /// Runs the stored content through CleanTags and returns the result.
        /// </summary>
        /// <returns>The cleaned markup produced by CleanTags.</returns>
        public string Clean()
        {
            return this.CleanTags(this._contents);
        }
        #endregion

        #region HTML Markup Handling

        /// <summary>
        /// Replaces every literal occurrence of the configured "URL_ROOT" app setting
        /// with the URL placeholder token.
        /// </summary>
        /// <param name="input">Markup to process.</param>
        /// <returns>
        /// The markup with the site root replaced; <paramref name="input"/> unchanged when it
        /// is null/empty or when the "URL_ROOT" setting is missing or empty.
        /// </returns>
        private string InsertRelativeUrlPlaceholder(string input)
        {
            string urlRoot = System.Configuration.ConfigurationManager.AppSettings["URL_ROOT"];

            // A missing or empty setting would make IndexOf/Replace throw, so there is nothing to do.
            if (string.IsNullOrEmpty(input) || string.IsNullOrEmpty(urlRoot))
            {
                return input;
            }

            // Plain string replacement: the setting is a URL, not a regular expression, and
            // characters such as '.' or '?' must only match themselves.
            return input.Replace(urlRoot, this._urlPlaceholder);
        }

        /// <summary>
        /// Swaps every URL placeholder token in the markup for the "URL_ROOT" app setting.
        /// </summary>
        /// <param name="input">Markup that may contain the placeholder token.</param>
        /// <returns>The markup with placeholders replaced.</returns>
        private string ReplaceRelativeUrlPlaceholder(string input)
        {
            string urlRoot = System.Configuration.ConfigurationManager.AppSettings["URL_ROOT"];
            return input.Replace(this._urlPlaceholder, urlRoot);
        }

        /// <summary>
        /// Replaces form-icon image tags with the HTML of the form they refer to.
        /// </summary>
        /// <remarks>
        /// A form icon is an &lt;img&gt; whose src contains "form_icon.gif?f=N". For each one,
        /// the form service is asked to verify form N; when it answers "1" the tag is replaced
        /// by the HTML the service generates. When the current request posted data for that
        /// same form id, the posted data is passed to the service; otherwise only the id is.
        /// </remarks>
        /// <param name="input">Markup to process.</param>
        /// <returns>The markup with verified form icons expanded; null or empty input is returned unchanged.</returns>
        private string ReplaceFormMarkup(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return input;
            }

            string formatMe = input;

            // replace forms. Looks like: <img class="formIcon" src="http://www.logicacmg.com/pSecured/admin/countries/_app/img/form_icon.gif?f=4" alt="" />
            if (formatMe.IndexOf("form_icon.gif", StringComparison.Ordinal) > -1)
            {
                // [^>]* keeps each match inside a single tag; a greedy ".*" would run on to the
                // last '>' of the line and take unrelated markup with it.
                Regex reg = new Regex("<img[^>]*src=[^>]*form_icon\\.gif\\?f=(\\d+)[^>]*>");
                MatchCollection matches = reg.Matches(formatMe);
                foreach (Match m in matches)
                {
                    // Group 1 is mandatory in the pattern, so it is set for every match.
                    string formId = m.Groups[1].Value;
                    com.logicacmg.www.ProcessRequest FormService = new ProjectName.Core.com.logicacmg.www.ProcessRequest();
                    if (FormService.VerifyForm(formId) != "1")
                    {
                        continue;
                    }

                    // Default when there is no current request or it was posted for another form.
                    string currentPostedFormData = "formId=" + formId;

                    // Explicit null checks instead of catching NullReferenceException.
                    System.Web.HttpContext context = System.Web.HttpContext.Current;
                    if (context != null
                        && context.Request != null
                        && context.Request.Form != null
                        && context.Request.Form["formId"] == formId)
                    {
                        currentPostedFormData = context.Request.Form.ToString();
                    }

                    string formHtml = FormService.GenerateForm(currentPostedFormData);
                    formatMe = formatMe.Replace(m.ToString(), formHtml);
                }
            }

            return formatMe;
        }

        /// <summary>
        /// Parses the markup with a Tidy document, runs clean-and-repair on it and returns
        /// the saved result with surrounding whitespace trimmed.
        /// </summary>
        /// <param name="input">Markup handed to the Tidy parser.</param>
        /// <returns>The trimmed string produced by the Tidy document.</returns>
        /// <exception cref="Exception">
        /// Thrown when ParseString or CleanAndRepair reports a negative status code.
        /// </exception>
        private string TidyHTML(string input)
        {
            Tidy.Document tidyDocument = new Tidy.Document();

            // The diagnostics hook is currently left detached:
            //tidyDocument.OnMessage += new Tidy.IDocumentEvents_OnMessageEventHandler(TidyDiagnostics);

            // Option meanings are presumed from their names (body only, XHTML output,
            // Word 2000 cleanup, automatic indentation) - confirm against the TidyATL docs.
            tidyDocument.SetOptBool(TidyOptionId.TidyBodyOnly, 1);
            tidyDocument.SetOptBool(TidyOptionId.TidyXhtmlOut, 1);
            tidyDocument.SetOptBool(TidyOptionId.TidyWord2000, 1);
            tidyDocument.SetOptValue(TidyOptionId.TidyIndentContent, "auto");

            int parseStatus = tidyDocument.ParseString(input);
            if (parseStatus < 0)
            {
                throw new Exception("Unable to parse string: " + input);
            }

            int repairStatus = tidyDocument.CleanAndRepair();
            if (repairStatus < 0)
            {
                throw new Exception("Unable to clean/repair string: " + input);
            }

            // RunDiagnostics() is not invoked on the document.

            string tidied = tidyDocument.SaveString();
            return tidied.Trim();
        }

        /// <summary>
        /// Writes a Tidy diagnostic message to the console.
        /// </summary>
        /// <param name="level">Report level; not used.</param>
        /// <param name="line">Line number; not used.</param>
        /// <param name="col">Column number; not used.</param>
        /// <param name="message">Text appended to the console line.</param>
        public void TidyDiagnostics(TidyATL.TidyReportLevel level, int line, int col, string message)
        {
            string report = "Tidy diagnostic message: " + message;
            Console.WriteLine(report);
        }

        /// <summary>
        /// Finds the index of the '&gt;' that closes the tag at the start of the string.
        /// A '&gt;' inside a quoted attribute value does not count as the end of the tag.
        /// </summary>
        /// <param name="input">Text expected to begin with the tag's opening '&lt;'.</param>
        /// <returns>
        /// The zero-based index of the closing '&gt;'; 0 when the input is null, empty or has
        /// no '&gt;' after its first character. If a quote is opened but never closed, the
        /// first '&gt;' seen is returned so one bad quote cannot swallow the rest of the text.
        /// </returns>
        private int InStrEndOfTag (string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return 0;
            }

            char openQuote = '\0';   // quote character we are currently inside, or '\0'
            int firstCloser = 0;     // first '>' seen anywhere, used as a fallback

            // Index 0 is the opening '<', so scanning starts at 1 and never reads past the end.
            for (int pos = 1; pos < input.Length; pos++)
            {
                char current = input[pos];

                if (current == '>' && firstCloser == 0)
                {
                    firstCloser = pos;
                }

                if (openQuote != '\0')
                {
                    if (current == openQuote)
                    {
                        openQuote = '\0';
                    }
                }
                else if (current == '>')
                {
                    return pos;
                }
                else if (current == '"' || current == '\'')
                {
                    openQuote = current;
                }
            }

            // Either no '>' at all (0) or an unterminated quote hid it (fall back to the first one).
            return firstCloser;
        }

        /// <summary>
        /// Filters markup against the tag policy table. Text outside tags is copied through.
        /// </summary>
        /// <remarks>
        /// Policy -1: the tag is dropped and, for an opening tag, so is everything up to and
        /// including its closing tag (when one exists).
        /// Policy 0: the tag is re-emitted as a bare &lt;name&gt; or &lt;/name&gt;, without attributes.
        /// Policy 1: the tag text is re-emitted unchanged.
        /// Tags that are not in the table are dropped; the text between them is kept.
        /// </remarks>
        /// <param name="input">Markup to filter.</param>
        /// <returns>The filtered markup; an empty string for null or empty input.</returns>
        private string RemoveExtraTags(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return "";
            }

            StringBuilder output = new StringBuilder(input.Length);
            int index = 0;

            while (index < input.Length)
            {
                if (input[index] != '<')
                {
                    // Copy a whole run of plain text at once rather than one character at a time.
                    int nextTag = input.IndexOf('<', index);
                    if (nextTag < 0)
                    {
                        nextTag = input.Length;
                    }

                    output.Append(input, index, nextTag - index);
                    index = nextTag;
                    continue;
                }

                string rest = input.Substring(index);
                int end = InStrEndOfTag(rest);
                string tag;

                if (end <= 0 || end >= rest.Length)
                {
                    // No closing '>' was found: everything after the '<' is the tag text.
                    tag = rest.Substring(1);
                    index = input.Length;
                }
                else
                {
                    tag = rest.Substring(1, end - 1);
                    index += end + 1;
                }

                // The name runs from after an optional leading '/' to the first whitespace.
                bool isClosing = tag.Length > 0 && tag[0] == '/';
                int nameStart = isClosing ? 1 : 0;
                int nameEnd = nameStart;
                while (nameEnd < tag.Length && !char.IsWhiteSpace(tag[nameEnd]))
                {
                    nameEnd++;
                }

                string name = tag.Substring(nameStart, nameEnd - nameStart);
                if (nameEnd == tag.Length && name.EndsWith("/", StringComparison.Ordinal))
                {
                    // "<br/>" written without a space: the slash is not part of the name.
                    name = name.Substring(0, name.Length - 1);
                }

                // Invariant casing so the lookup does not depend on the current culture.
                name = name.ToLowerInvariant();

                if (!this._tagMap.Contains(name))
                {
                    continue;
                }

                int policy = Convert.ToInt32(this._tagMap[name]);
                switch (policy)
                {
                    case -1:
                        // Only an opening tag removes content; a stray closing tag is just dropped.
                        if (!isClosing)
                        {
                            int closeAt = input.IndexOf("</" + name + ">", index, StringComparison.OrdinalIgnoreCase);
                            if (closeAt >= 0)
                            {
                                // Skip past "</" + name + ">".
                                index = closeAt + name.Length + 3;
                            }
                        }
                        break;
                    case 0:
                        output.Append('<');
                        if (isClosing)
                        {
                            output.Append('/');
                        }
                        output.Append(name).Append('>');
                        break;
                    case 1:
                        output.Append('<').Append(tag).Append('>');
                        break;
                    default:
                        break;
                }
            }

            return output.ToString();
        }

        /// <summary>
        /// Cleaning pipeline: Tidy, comment removal, attribute removal, tag filtering and
        /// finally URL placeholder insertion, in that order.
        /// </summary>
        /// <param name="input">Markup to clean; trimmed before it is handed to Tidy.</param>
        /// <returns>The cleaned markup.</returns>
        private string CleanTags(string input)
        {
            // HTML Tidy goes first.
            string tidied = this.TidyHTML(input.Trim());

            // Comments are removed early to make tag balancing a little easier.
            string withoutComments = this.StripComments(tidied);

            // Attributes that are never acceptable (e.g., JavaScript attributes, CSS).
            string withoutNastyAttributes = this.ReplaceNastyAttributes(withoutComments);

            string allowedTagsOnly = RemoveExtraTags(withoutNastyAttributes);

            // Placeholder substitution for relative links comes last.
            return this.InsertRelativeUrlPlaceholder(allowedTagsOnly);
        }

        /// <summary>
        /// Removes every HTML comment (&lt;!-- ... --&gt;) from the markup.
        /// </summary>
        /// <param name="input">Markup to process.</param>
        /// <returns>The markup without comments; null or empty input is returned unchanged.</returns>
        private string StripComments(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return input;
            }

            // Lazy ".*?" stops at the first "-->" so text between two comments survives;
            // Singleline lets '.' cross line breaks so multi-line comments are removed too.
            return Regex.Replace(input, "<!--.*?-->", "", RegexOptions.Singleline);
        }

        /// <summary>
        /// Removes quoted on* (event handler), style and id attributes from the markup,
        /// together with the whitespace in front of them.
        /// </summary>
        /// <remarks>
        /// The pattern is applied to the whole string, not only to text inside tags.
        /// Only quoted attribute values are matched.
        /// </remarks>
        /// <param name="input">Markup to process.</param>
        /// <returns>The markup without those attributes; null or empty input is returned unchanged.</returns>
        private string ReplaceNastyAttributes(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return input;
            }

            // - leading \s+ keeps names that merely end in "id"/"style" (e.g. data-id) intact
            // - each quote style is matched with its own closing quote, so "it's" is one value
            // - [^"]* / [^']* also cover values that span several lines
            // - IgnoreCase catches onClick, STYLE, ...
            return Regex.Replace(
                input,
                "\\s+(on[a-z]+|style|id)\\s*=\\s*(\"[^\"]*\"|'[^']*')",
                "",
                RegexOptions.IgnoreCase);
        }

        #endregion HTML Markup Handling

    }
}

Initial URL


Initial Description
Requires TidyATL library for .NET - http://www.devx.com/dotnet/Article/20505/1763/page/2

Initial Title
.NET HTML Formatter

Initial Tags
html, Net

Initial Language
C#