.NET - C# - WebClient and HtmlAgilityPack - Fetching and Parsing HTML


/ Published in: C#
Save to your folder(s)

Shows how to use the WebClient class to retrieve HTML from a URL and then use HtmlAgilityPack to parse it.


Copy this code and paste it in your HTML
  1. using System;
  2. using System.Net;
  3. using System.Collections.Generic;
  4.  
  5. using System.Linq;
  6. using HtmlAgilityPack;
  7.  
  namespace PixarWebClient
  {
      /// <summary>
      /// Console sample: downloads the Wikipedia "List of Pixar films" page with
      /// <see cref="WebClient"/>, parses it with HtmlAgilityPack and prints one
      /// line per film in the form "&lt;second column&gt; - &lt;title&gt;".
      /// </summary>
      public class PixarWebClient
      {
          /// <summary>Page that contains the film table.</summary>
          private const string PixarFilmsUrl = "http://en.wikipedia.org/wiki/List_of_Pixar_films";

          /// <summary>
          /// CSS classes the film table must carry. They are matched as individual
          /// tokens, so their order and any additional classes do not matter.
          /// </summary>
          private static readonly string[] RequiredTableClasses = { "sortable", "wikitable" };

          /// <summary>
          /// Program entry point. Fetches the page, locates the film table and
          /// writes the films to standard output.
          /// </summary>
          /// <param name="args">Command-line arguments (unused).</param>
          public static void Main(string[] args)
          {
              // fetching HTML
              string pixarHtml;
              using (WebClient client = new WebClient())
              {
                  pixarHtml = client.DownloadString(PixarFilmsUrl);
              }

              HtmlDocument document = new HtmlDocument();
              document.LoadHtml(pixarHtml);

              HtmlNode pixarTable = FindFilmTable(document);
              if (pixarTable == null)
              {
                  // Report a clear error instead of letting First() throw
                  // "Sequence contains no elements" when the page layout changes.
                  Console.Error.WriteLine("Could not find the film table in the downloaded page.");
                  Environment.ExitCode = 1;
                  return;
              }

              // Skip the first row, which contains header information. Skipping
              // (rather than removing the node from the DOM) does not depend on
              // the query being re-evaluated after a mutation.
              foreach (HtmlNode row in pixarTable.Descendants("tr").Skip(1))
              {
                  // Only the first two cells are used: the title and the value printed next to it.
                  List<HtmlNode> columns = row.Descendants("td").Take(2).ToList();
                  if (columns.Count < 2)
                  {
                      // Rows without two data cells produce no output.
                      continue;
                  }

                  string title = ExtractTitle(columns[0]);
                  Console.WriteLine(CleanText(columns[1].InnerText) + " - " + title);
              }
          }

          /// <summary>
          /// Returns the first table in the document that carries every class in
          /// <see cref="RequiredTableClasses"/>, or <c>null</c> when there is none.
          /// </summary>
          private static HtmlNode FindFilmTable(HtmlDocument document)
          {
              return document.DocumentNode
                  .Descendants("table")
                  .FirstOrDefault(HasRequiredClasses);
          }

          /// <summary>
          /// Checks the table's class attribute token by token. A table without a
          /// class attribute simply does not match (the indexer would return null
          /// and dereferencing it would throw).
          /// </summary>
          private static bool HasRequiredClasses(HtmlNode table)
          {
              string[] classes = table
                  .GetAttributeValue("class", string.Empty)
                  .Split(new[] { ' ', '\t', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

              return RequiredTableClasses.All(required => classes.Contains(required));
          }

          /// <summary>
          /// Reads the film title from the title cell. The title is expected to be
          /// an italicised link (&lt;i&gt;&lt;a&gt;...&lt;/a&gt;&lt;/i&gt;); when that
          /// markup is absent the italic text, and then the whole cell text, is used
          /// instead of failing with a null reference.
          /// </summary>
          private static string ExtractTitle(HtmlNode titleCell)
          {
              HtmlNode italic = titleCell.Element("i");
              HtmlNode link = italic == null ? null : italic.Element("a");
              HtmlNode source = link ?? italic ?? titleCell;

              return CleanText(source.InnerText);
          }

          /// <summary>
          /// Decodes HTML entities (for example &amp;amp;) and trims surrounding
          /// whitespace so each film prints on a single clean line.
          /// </summary>
          private static string CleanText(string text)
          {
              return HtmlEntity.DeEntitize(text).Trim();
          }
      }
  }

URL: http://htmlagilitypack.codeplex.com/

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.