Posted By

danfsmith on 08/25/11


Tagged

html pdf


Versions (?)

Get All PDF links from HTML files


 / Published in: Windows PowerShell
 

  # Lists every <a> element whose href contains "pdf" in the .htm/.aspx files
  # found under $path.
  #
  # Parameters:
  #   $path    - root directory searched recursively for *.htm and *.aspx files.
  #   $urlpath - text substituted for $path inside each file's full name to
  #              build the FileName value (backslashes become forward slashes).
  #
  # Output: one object per matching link with PdfLink, FileName and LineNumber
  # properties, sorted by PdfLink. Progress messages are written with Write-Host.
  param ($path, $urlpath)

  # Requires the HtmlAgilityPack assembly; this location is machine-specific.
  Add-Type -Path f:\dan\tools\html-agility-pack\HtmlAgilityPack.dll

  $files = Get-ChildItem -Include *.htm,*.aspx -Path $path -Recurse
  $doc = New-Object HtmlAgilityPack.HtmlDocument

  $result = $files | ForEach-Object {
      Write-Host "Checking $_"

      # Map the on-disk path to its URL form.
      $name = $_.FullName.Replace($path, $urlpath).Replace("\", "/")

      # The single HtmlDocument instance is reused for every file; the result
      # of Load is discarded so nothing from it can leak into $result.
      $null = $doc.Load($_.FullName)

      $linknodes = $doc.DocumentNode.SelectNodes("//a")

      # Skip files for which no <a> nodes were selected.
      if ($linknodes) {
          foreach ($node in $linknodes) {
              # Read href once; an empty string is the fallback when it is absent.
              $pdflink = $node.GetAttributeValue("href", "")
              if ($pdflink.ToLower().Contains("pdf")) {
                  Write-Host "Found" $pdflink
                  New-Object PsObject -Property @{PdfLink = $pdflink; FileName = $name; LineNumber = $node.Line}
              }
          }
      }
  }

  $result | Sort-Object PdfLink

Report this snippet  

You need to log in to post a comment.