Posted By

Sverri on 10/16/10


Tagged

extracturls


Versions (?)

Who likes this?

2 people have marked this snippet as a favorite

laforge
jamepaul


Extract Urls From Website


 / Published in: Python
 

  1. import re
  2. import urllib.request
  3.  
  # Retrieves the domains that a website's HTML source code links to.

  # Historical HTTP default charset, used when the server declares none
  # (or declares one Python does not know).
  _DEFAULT_CHARSET = "ISO-8859-1"

  # Absolute http(s) URL. An optional literal "www." prefix is skipped and the
  # remaining host name is captured. The TLD may be 2-63 letters so that long
  # generic TLDs (".online", ".technology", ...) are recognised too.
  _URL_PATTERN = re.compile(
      r"https?://(?:www\.)?([a-z0-9\-.]+\.[a-z]{2,63})\b", re.I
  )


  def _fetchSource(url):
      """Download ``http://www.<url>`` and return its body decoded to text."""
      # Prepend "www." if not present
      if not url.startswith("www."):
          url = "www." + url
      with urllib.request.urlopen("http://" + url) as response:
          # Only the Content-Type header carries the charset; the header
          # parser returns it lower-cased, or None when it is absent.
          charset = response.headers.get_content_charset() or _DEFAULT_CHARSET
          raw = response.read()
      try:
          # Host names are plain ASCII, so replacing undecodable bytes cannot
          # hide a URL but does keep one stray byte from aborting the scan.
          return raw.decode(charset, errors="replace")
      except LookupError:
          # The server announced a charset Python has no codec for.
          return raw.decode(_DEFAULT_CHARSET)


  def extractUrlsFrom(url, unique=True, sort=True, restrictToTld=None,
                      source=None):
      """Return the domains referenced by absolute links in a web page.

      Every ``http://`` or ``https://`` URL found in the page is reduced to
      its last two host labels, lower-cased ("http://Blog.Example.com/x"
      becomes "example.com").

      NOTE: keeping only two labels misreports hosts under multi-part
      public suffixes ("bbc.co.uk" becomes "co.uk"); fixing that needs a
      public-suffix list, which the standard library does not provide.

      Parameters:
          url: host to download, without scheme. "www." is prepended when
              missing and the page is requested over plain HTTP. Ignored
              when ``source`` is given.
          unique: when true, each domain is reported once (first
              occurrence wins when ``sort`` is false).
          sort: when true, the result is sorted alphabetically; otherwise
              it is in document order.
          restrictToTld: if set, keep only domains with this top-level
              domain. Case-insensitive; a leading dot is accepted.
          source: optional, already-downloaded HTML text. When provided no
              network request is made.

      Returns:
          A list of domain strings; empty when the page has no matching
          links.

      Raises:
          urllib.error.URLError: if the page cannot be downloaded.
      """
      if source is None:
          source = _fetchSource(url)

      wanted_tld = None
      if restrictToTld:
          wanted_tld = restrictToTld.lower().lstrip(".")

      seen = set()
      collection = []
      for host in _URL_PATTERN.findall(source):
          # Drop any subdomains: keep just "<name>.<tld>".
          domain = ".".join(host.lower().split(".")[-2:])
          if wanted_tld is not None and domain.rpartition(".")[2] != wanted_tld:
              continue
          if unique:
              if domain in seen:
                  continue
              seen.add(domain)
          collection.append(domain)

      return sorted(collection) if sort else collection
  53.  
  # Test: list the domains linked from snipplr.com. Guarded so that importing
  # this file as a module does not trigger a network request.
  if __name__ == "__main__":
      url = "snipplr.com"
      print("From:", url)
      for x in extractUrlsFrom(url):
          print("-", x)

Report this snippet  

You need to log in to post a comment.