Return to Snippet

Revision: 60360
at November 5, 2012 14:50 by denakitan


Initial Code
using System;
using System.Net;
using System.Collections.Generic;

using System.Linq;
using HtmlAgilityPack;

namespace PixarWebClient
{
    /// <summary>
    /// Console sample that downloads the Wikipedia "List of Pixar films" page with
    /// <see cref="WebClient"/> and uses HtmlAgilityPack to print, for every table row,
    /// the text of the second cell followed by the film title taken from the first cell.
    /// </summary>
    public class PixarWebClient
    {
        /// <summary>Address of the page that is downloaded and parsed.</summary>
        private const string PixarFilmsUrl = "http://en.wikipedia.org/wiki/List_of_Pixar_films";

        /// <summary>Exact value of the <c>class</c> attribute identifying the table to read.</summary>
        private const string FilmTableClass = "sortable wikitable";

        /// <summary>
        /// Program entry point: fetches the page, locates the film table and writes one
        /// "<c>second cell - title</c>" line per data row to standard output.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static void Main(string[] args)
        {
            string pixarHtml;

            try
            {
                // fetching HTML
                using (WebClient client = new WebClient())
                {
                    pixarHtml = client.DownloadString(PixarFilmsUrl);
                }
            }
            catch (WebException ex)
            {
                Console.Error.WriteLine("Could not download " + PixarFilmsUrl + ": " + ex.Message);
                return;
            }

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(pixarHtml);

            HtmlNode pixarTable = FindFilmTable(document);
            if (pixarTable == null)
            {
                Console.Error.WriteLine("No table with class \"" + FilmTableClass + "\" was found in the page.");
                return;
            }

            // Skip(1) drops the first row (header information) without mutating the document,
            // so the result does not depend on the query being re-evaluated after a removal.
            foreach (HtmlNode row in pixarTable.Descendants("tr").Skip(1))
            {
                // Only the first two cells of a row are used; rows with fewer print nothing.
                List<HtmlNode> cells = row.Descendants("td").Take(2).ToList();
                if (cells.Count < 2)
                {
                    continue;
                }

                string title = ExtractTitle(cells[0]);
                Console.WriteLine(cells[1].InnerText + " - " + title);
            }
        }

        /// <summary>
        /// Returns the first <c>table</c> element whose <c>class</c> attribute equals
        /// <see cref="FilmTableClass"/>, or <c>null</c> when there is none.
        /// </summary>
        /// <param name="document">The parsed HTML document to search.</param>
        private static HtmlNode FindFilmTable(HtmlDocument document)
        {
            // GetAttributeValue supplies a default for tables that have no class attribute,
            // where indexing Attributes["class"] would yield null and crash on .Value.
            return document.DocumentNode
                .Descendants("table")
                .FirstOrDefault(table => table.GetAttributeValue("class", string.Empty) == FilmTableClass);
        }

        /// <summary>
        /// Reads the film title from a title cell. The expected markup is a link nested in
        /// an italic element (<c>&lt;i&gt;&lt;a&gt;Title&lt;/a&gt;&lt;/i&gt;</c>); when that
        /// structure is missing, the cell's own trimmed text is returned instead.
        /// </summary>
        /// <param name="titleCell">The first <c>td</c> of a film row.</param>
        private static string ExtractTitle(HtmlNode titleCell)
        {
            HtmlNode italic = titleCell.Element("i");
            HtmlNode link = italic == null ? null : italic.Element("a");

            return link != null ? link.InnerText : titleCell.InnerText.Trim();
        }
    }
}

Initial URL
http://htmlagilitypack.codeplex.com/

Initial Description
Shows how to use the WebClient class to retrieve HTML from a URL and then use HtmlAgilityPack to parse it.

Initial Title
.NET - C# - WebClient and HtmlAgilityPack - Fetching and Parsing HTML

Initial Tags
html, Net, c#

Initial Language
C#