Posted By

jerryvig on 01/27/12


Tagged

curl html scrape parsing c++ libcurl htmlcxx nasdaq ipos


Versions (?)

Nasdaq IPOs Scrape


 / Published in: C++
 

This is a C++ program that scrapes IPO data from Nasdaq.com, using libcurl (through the curlpp wrapper) to download the pages and htmlcxx to parse the HTML.

  #include <unistd.h>

  #include <algorithm>
  #include <cstddef>
  #include <fstream>
  #include <iostream>
  #include <sstream>
  #include <stdexcept>
  #include <string>
  #include <vector>

  #include <curlpp/cURLpp.hpp>
  #include <curlpp/Easy.hpp>
  #include <curlpp/Options.hpp>
  #include <htmlcxx/html/ParserDom.h>

  using namespace std;
  using namespace htmlcxx;

  16. int main() {
  17. vector<string> tabStrings;
  18. tabStrings.push_back( "pricings" );
  19. tabStrings.push_back( "upcoming" );
  20. tabStrings.push_back( "filings" );
  21.  
  22. vector<string> monthStrings;
  23. monthStrings.push_back("2011-01");
  24. monthStrings.push_back("2011-02");
  25. monthStrings.push_back("2011-03");
  26. monthStrings.push_back("2011-04");
  27. monthStrings.push_back("2011-05");
  28. monthStrings.push_back("2011-06");
  29. monthStrings.push_back("2011-07");
  30. monthStrings.push_back("2011-08");
  31. monthStrings.push_back("2011-09");
  32. monthStrings.push_back("2011-10");
  33. monthStrings.push_back("2011-11");
  34. monthStrings.push_back("2011-12");
  35. monthStrings.push_back("2012-01");
  36.  
  37. fstream outFile( "/tmp/NasdaqIPOs.csv", fstream::out );
  38.  
  39. for ( vector<string>::iterator iter = tabStrings.begin(); iter != tabStrings.end(); iter++ ) {
  40. for ( vector<string>::iterator monthIter=monthStrings.begin(); monthIter!=monthStrings.end(); monthIter++ ) {
  41. cout << "Scraping " << *iter << " for month = " << *monthIter << endl;
  42. try {
  43. curlpp::Easy myRequest;
  44. myRequest.setOpt(curlpp::options::Url((std::string( "http://www.nasdaq.com/markets/ipos/activity.aspx?tab="+*iter+"&month="+*monthIter))));
  45. ostringstream os;
  46. os << myRequest;
  47. string content = os.str();
  48.  
  49. HTML::ParserDom parser;
  50.  
  51. if ( *iter == "pricings" || *iter == "upcoming" ) {
  52. string type = "";
  53. if ( *iter == "pricings" ) {
  54. type = "PRICING";
  55. }
  56. else if ( *iter == "upcoming" ) {
  57. type = "UPCOMING";
  58. }
  59.  
  60. int startIdx = content.find("<div class=\"genTable\">");
  61. int endIdx = content.find("<!-- end tabpane");
  62.  
  63. if ( startIdx > 0 && endIdx > startIdx ) {
  64. string htmlContent = content.substr(startIdx,(endIdx-startIdx));
  65. tree<HTML::Node> dom = parser.parseTree( htmlContent );
  66.  
  67. for ( tree<HTML::Node>::iterator treeIter=dom.begin(); treeIter!=dom.end(); treeIter++ ) {
  68. if ( treeIter->tagName() == "tr" ) {
  69. string trHtml = treeIter->content( htmlContent );
  70. tree<HTML::Node> trDom = parser.parseTree( trHtml );
  71. int tdCount = 0;
  72. string name = "";
  73. string ticker = "";
  74. string market = "";
  75. string price = "";
  76. string shares = "";
  77. string offerAmount = "";
  78. string datePriced = "";
  79.  
  80. for ( tree<HTML::Node>::iterator trIter=trDom.begin(); trIter!=trDom.end(); trIter++ ) {
  81. if ( trIter->tagName() == "td" ) {
  82. if ( tdCount == 0 ) {
  83. string tdHtml = trIter->content( trHtml );
  84. int startIndex = tdHtml.find("\">");
  85. int endIndex = tdHtml.find("</a>");
  86. name = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
  87. }
  88. else if ( tdCount == 1 ) {
  89. string tdHtml = trIter->content( trHtml );
  90. int startIndex = tdHtml.find("\">");
  91. int endIndex = tdHtml.find("</a>");
  92. ticker = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
  93. }
  94. else if ( tdCount == 2 ) {
  95. string tdHtml = trIter->content( trHtml );
  96. int startIndex = tdHtml.find("\">");
  97. int endIndex = tdHtml.find("</a>");
  98. market = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
  99. }
  100. else if ( tdCount == 3 ) {
  101. string tdHtml = trIter->content( trHtml );
  102. price = tdHtml;
  103. }
  104. else if ( tdCount == 4 ) {
  105. string tdHtml = trIter->content( trHtml );
  106. shares = tdHtml;
  107. }
  108. else if ( tdCount == 5 ) {
  109. string tdHtml = trIter->content( trHtml );
  110. offerAmount = tdHtml;
  111. }
  112. else if ( tdCount == 6 ) {
  113. string tdHtml = trIter->content( trHtml );
  114. datePriced = tdHtml;
  115. }
  116. tdCount++;
  117. }
  118. }
  119. if ( name != "" ) {
  120. outFile << "\"" + type + "\",\"" + name + "\",\"" + ticker + "\",\"" + market + "\",\"" + price + "\",\"" + shares + "\",\"" + offerAmount + "\",\"" + datePriced + "\"" << endl;
  121. }
  122. }
  123. }
  124. }
  125. }
  126. else if ( *iter == "filings" ) {
  127. string type = "FILING";
  128. int startIdx = content.find("<div class=\"genTable\">");
  129. int endIdx = content.find("<!-- end tabpane");
  130.  
  131. if ( startIdx > 0 && endIdx > startIdx ) {
  132. string htmlContent = content.substr(startIdx,(endIdx-startIdx));
  133. tree<HTML::Node> dom = parser.parseTree( htmlContent );
  134.  
  135. for ( tree<HTML::Node>::iterator treeIter=dom.begin(); treeIter!=dom.end(); treeIter++ ) {
  136. if ( treeIter->tagName() == "tr" ) {
  137. string trHtml = treeIter->content( htmlContent );
  138. tree<HTML::Node> trDom = parser.parseTree( trHtml );
  139. int tdCount = 0;
  140. string name = "";
  141. string ticker = "";
  142. string offerAmount = "";
  143. string dateFiled = "";
  144.  
  145. string market = "";
  146. string price = "";
  147. string shares = "";
  148.  
  149. for ( tree<HTML::Node>::iterator trIter=trDom.begin(); trIter!=trDom.end(); trIter++ ) {
  150. if ( trIter->tagName() == "td" ) {
  151. if ( tdCount == 0 ) {
  152. string tdHtml = trIter->content( trHtml );
  153. int startIndex = tdHtml.find("\">");
  154. int endIndex = tdHtml.find("</a>");
  155. name = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
  156. }
  157. else if ( tdCount == 1 ) {
  158. string tdHtml = trIter->content( trHtml );
  159. ticker = tdHtml;
  160. try {
  161. if ( ticker.find("</a>") != string::npos ) {
  162. int startIndex = ticker.find("\">");
  163. int endIndex = ticker.find("</a>");
  164. ticker = ticker.substr(startIndex+2,(endIndex-startIndex-2));
  165. }
  166. } catch ( std::out_of_range &e ) {}
  167. }
  168. else if ( tdCount == 2 ) {
  169. string tdHtml = trIter->content( trHtml );
  170. offerAmount = tdHtml;
  171. }
  172. else if ( tdCount == 3 ) {
  173. string tdHtml = trIter->content( trHtml );
  174. dateFiled = tdHtml;
  175. }
  176. tdCount++;
  177. }
  178. }
  179. if ( name != "" ) {
  180. outFile << "\"" + type + "\",\"" + name + "\",\"" + ticker + "\",\"" + market + "\",\"" + price + "\",\"" + shares + "\",\"" + offerAmount + "\",\"" + dateFiled + "\"" << endl;
  181. }
  182. }
  183. }
  184. }
  185.  
  186. }
  187. } catch( curlpp::RuntimeError &e ) { std::cout << e.what() << std::endl; }
  188. catch( curlpp::LogicError &e ) { std::cout << e.what() << std::endl; }
  189. catch( std::out_of_range &e ) { cout << e.what() << endl; }
  190. usleep( 200000 );
  191. }
  192. }
  193. outFile.close();
  194. return 0;
  195. }

Report this snippet  

You need to log in to post a comment.