AyeEmtract - PHP Class that implements Curl in Parallel to extract Email Addresses


Published in: PHP
Save to your folder(s)

<a href="http://pantuts.com/2013/10/24/ayeemtract-using-curl-parallel/">http://pantuts.com/2013/10/24/ayeemtract-using-curl-parallel/</a>


Copy this code and paste it in your HTML
  1. <?php
  2.  
  3. /**
  4.  * AyeEmtract a class that crawls pages and
  5.  * then searches for email addresses using curl in parallel.
  6.  *
  7.  * PHP version >= 5.x
  8.  *
  9.  * @category PHP
  10.  * @author PANTUTS
  11.  * @license http://www.gnu.org/licenses/gpl.txt
  12.  * @link http://www.pantuts.com
  13.  */
  14. class AyeEmtract
  15. {
  16. // variable to store data
  17. private $_results = array();
  18. // array of urls
  19. private $_urls = array();
  20. // addition curl options
  21. private $_options = array();
  22. // errors
  23. public $errors = array();
  24.  
  25. /**
  26.   * Creates object and store array of urls and additional options
  27.   *
  28.   * @param array $urls array of urls
  29.   * @param array $options addition curl options
  30.   */
  31. public function __construct($urls, $options = array())
  32. {
  33. $this->_urls = $urls;
  34. $this->_options = $options;
  35. }
  36.  
  37. /**
  38.   * Sets curl_multi, curl options, and executes curl
  39.   * per url.
  40.   */
  41. public function start()
  42. {
  43. // initialize curl multi handle
  44. $curlMaster = curl_multi_init();
  45. // curl singles array
  46. $curlh = array();
  47.  
  48. // loop all urls
  49. foreach ($this->_urls as $i => $url) {
  50. // curl_init each url
  51. $curlh[$i] = curl_init();
  52.  
  53. $curlh[$i], array(
  54. CURLOPT_URL => htmlentities(trim($url)),
  55. CURLOPT_SSL_VERIFYPEER => false,
  56. CURLOPT_USERAGENT => self::_setUserAgent(),
  57. CURLOPT_HEADER => false,
  58. CURLOPT_RETURNTRANSFER => true,
  59. CURLOPT_MAXREDIRS => 7,
  60. CURLOPT_CONNECTTIMEOUT => 20,
  61. CURLOPT_FRESH_CONNECT => true )
  62. );
  63.  
  64. // check additional options
  65. if (!empty($this->_options)) {
  66. curl_setopt_array($curlh[$i], $this->_options);
  67. }
  68. // now add multi handle
  69. curl_multi_add_handle($curlMaster, $curlh[$i]);
  70. }
  71.  
  72. // execute multi handles
  73. $running = null;
  74. do {
  75. // save errors if encountered
  76. if (curl_multi_exec($curlMaster, $running) === false) {
  77. $this->errors[$i] = 'ERROR: url = ' . curl_error($curlh[$i]) .
  78. ', code = ' . curl_errno($curlh[$i]);
  79. }
  80. } while ($running > 0);
  81.  
  82. // get content on each url from curlh
  83. foreach ($curlh as $j => $ch) {
  84. // save results and remove handle
  85. $this->_results[$j] = curl_multi_getcontent($ch);
  86.  
  87. curl_multi_remove_handle($curlMaster, $ch);
  88. }
  89.  
  90. // close handle
  91. curl_multi_close($curlMaster);
  92. }
  93.  
  94. /**
  95.   * Returns array of emails
  96.   *
  97.   * @return array_unique(array)
  98.   */
  99. public function getEmails()
  100. {
  101. $emails = array();
  102. // regex to find valid emails
  103. $re = "/([\s]*)([_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*([ ]+|)@([ ]+|)([a-zA-Z0-9-]+\.)+([a-zA-Z]{2,}))([\s]*)/i";
  104.  
  105. foreach ($this->_results as $i => $res) {
  106. // decode htmlentities like %3C
  107. $res = html_entity_decode($res);
  108. preg_match_all($re, $res, $matches);
  109.  
  110. foreach ($matches[0] as $match) {
  111. // save to array found emails
  112. $emails[$i] = trim($match);
  113. }
  114. }
  115. // remove duplicates
  116. return array_unique($emails);
  117. }
  118.  
  119. /**
  120.   * Returns random user-agent
  121.   *
  122.   * @return $ua
  123.   */
  124. private static function _setUserAgent()
  125. {
  126. $userAgents = array(
  127. 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
  128. 'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
  129. 'Opera/9.80 (J2ME/MIDP; Opera Mini/5.0 (Windows; U; Windows NT 5.1; en) AppleWebKit/886; U; en) Presto/2.4.15',
  130. 'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
  131. 'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))',
  132. 'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
  133. 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
  134. 'Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36',
  135. 'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
  136. 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2'
  137. );
  138. $ua = $userAgents[array_rand($userAgents)];
  139. return $ua;
  140. }
  141. }
  142. ?>

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.