Published in: C#
Shows how to use the WebClient class to retrieve HTML from a URL and then parse it with HtmlAgilityPack.
Expand |
Embed | Plain Text
Copy this code and paste it in your HTML
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using HtmlAgilityPack;

namespace PixarWebClient
{
    /// <summary>
    /// Console sample: downloads the Wikipedia "List of Pixar films" page with
    /// <see cref="WebClient"/> and parses it with HtmlAgilityPack, printing one
    /// line per table row in the form "&lt;second cell text&gt; - &lt;title&gt;".
    /// </summary>
    public class PixarWebClient
    {
        private const string PixarFilmsUrl = "http://en.wikipedia.org/wiki/List_of_Pixar_films";
        private const string FilmTableClass = "sortable wikitable";

        /// <summary>
        /// Program entry point. Fetches the page, locates the first table whose
        /// class attribute is exactly "sortable wikitable", and writes the first
        /// two cells of every non-header row to standard output.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static void Main(string[] args)
        {
            // fetching HTML; WebClient is IDisposable, so scope it with using.
            string pixarHtml;
            using (WebClient client = new WebClient())
            {
                pixarHtml = client.DownloadString(PixarFilmsUrl);
            }

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(pixarHtml);

            // GetAttributeValue returns the supplied default when the attribute
            // is missing, so tables without a class attribute no longer cause a
            // NullReferenceException.
            HtmlNode pixarTable = document.DocumentNode
                .Descendants("table")
                .FirstOrDefault(t => t.GetAttributeValue("class", string.Empty) == FilmTableClass);

            if (pixarTable == null)
            {
                Console.Error.WriteLine("No table with class \"" + FilmTableClass + "\" was found at " + PixarFilmsUrl);
                return;
            }

            // Skip the first row, which contains header information. This avoids
            // removing a node from the DOM while a deferred query over it is live.
            IEnumerable<HtmlNode> pixarRows = pixarTable.Descendants("tr").Skip(1);

            foreach (HtmlNode row in pixarRows)
            {
                // Only the first two cells are used: the title and the cell after it.
                List<HtmlNode> cells = row.Descendants("td").Take(2).ToList();
                if (cells.Count < 2)
                {
                    continue;
                }

                Console.WriteLine(cells[1].InnerText + " - " + ExtractTitle(cells[0]));
            }
        }

        /// <summary>
        /// Reads the film title from a title cell, expected to be nested as
        /// &lt;td&gt;&lt;i&gt;&lt;a&gt;Title&lt;/a&gt;&lt;/i&gt;&lt;/td&gt;.
        /// Falls back to the cell's own text when that nesting is absent.
        /// </summary>
        /// <param name="titleCell">The first &lt;td&gt; of a table row.</param>
        /// <returns>The link text if present; otherwise the trimmed cell text.</returns>
        private static string ExtractTitle(HtmlNode titleCell)
        {
            HtmlNode italic = titleCell.Element("i");
            HtmlNode link = italic == null ? null : italic.Element("a");
            return link != null ? link.InnerText : titleCell.InnerText.Trim();
        }
    }
}
URL: http://htmlagilitypack.codeplex.com/