Posted By

denakitan on 11/05/12

Tagged

Versions (?)

Last Edited at 11/05/12 02:50pm

Statistics

Viewed 1736 times

Favorited by 1 user(s)

Related snippets

.NET - C# - WebClient and HtmlAgilityPack - Fetching and Parsing HTML

/ Published in: C#

Shows how to use the WebClient class to retrieve HTML from a URL and then use HtmlAgilityPack to parse it.

Expand | Embed | Plain Text

Copy this code and paste it in your HTML

using System;
using System.Net;
using System.Collections.Generic;
 
using System.Linq;
using HtmlAgilityPack;
 
namespace PixarWebClient
{
    /// <summary>
    /// Console sample that downloads the Wikipedia "List of Pixar films" page with
    /// <see cref="WebClient"/>, parses it with HtmlAgilityPack and prints one line per
    /// table row in the form "&lt;second cell text&gt; - &lt;title&gt;".
    /// </summary>
    public class PixarWebClient
    {
        // Page that is downloaded and parsed.
        private const string PixarFilmsUrl = "http://en.wikipedia.org/wiki/List_of_Pixar_films";

        // Exact value of the class attribute that identifies the table to read.
        private const string FilmTableClass = "sortable wikitable";

        /// <summary>
        /// Program entry point: fetches the page, locates the first table whose class
        /// attribute equals <see cref="FilmTableClass"/>, and writes each data row to
        /// standard output.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static void Main(string[] args)
        {
            // fetching HTML; the client is only needed for the download itself
            string pixarHtml;
            using (WebClient client = new WebClient())
            {
                pixarHtml = client.DownloadString(PixarFilmsUrl);
            }

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(pixarHtml);

            // GetAttributeValue supplies a default for tables that have no class
            // attribute at all; indexing Attributes["class"] directly would throw
            // a NullReferenceException on such a table.
            HtmlNode pixarTable = document.DocumentNode.Descendants()
                .FirstOrDefault(d => d.Name == "table"
                                     && d.GetAttributeValue("class", string.Empty) == FilmTableClass);

            if (pixarTable == null)
            {
                Console.Error.WriteLine("Could not find a table with class \"" + FilmTableClass + "\" in " + PixarFilmsUrl);
                Environment.ExitCode = 1;
                return;
            }

            // Skip the first row, which contains header information. Skipping in the
            // query avoids mutating the parsed document just to hide one row.
            IEnumerable<HtmlNode> pixarRows = pixarTable.Descendants()
                .Where(d => d.Name == "tr")
                .Skip(1);

            foreach (HtmlNode row in pixarRows)
            {
                // Only the first two cells are used: the title cell and the cell after it.
                List<HtmlNode> columns = row.Descendants()
                    .Where(d => d.Name == "td")
                    .Take(2)
                    .ToList();

                // Rows with fewer than two cells produce no output.
                if (columns.Count < 2)
                {
                    continue;
                }

                string title = ExtractTitle(columns[0]);
                Console.WriteLine(columns[1].InnerText + " - " + title);
            }
        }

        /// <summary>
        /// Returns the text of the link nested as &lt;i&gt;&lt;a&gt; directly inside the
        /// title cell, or the cell's own trimmed text when that nesting is absent.
        /// </summary>
        /// <param name="titleCell">The first &lt;td&gt; of a table row.</param>
        /// <returns>The film title text.</returns>
        private static string ExtractTitle(HtmlNode titleCell)
        {
            // Element() returns null when no matching child exists, so each step
            // is checked before dereferencing.
            HtmlNode italic = titleCell.Element("i");
            HtmlNode link = italic == null ? null : italic.Element("a");

            return link != null ? link.InnerText : titleCell.InnerText.Trim();
        }
    }
}