Posted By

jerryvig on 01/27/12


Tagged

curl html scrape parsing c++ libcurl htmlcxx nasdaq ipos


Versions (?)

Nasdaq SPOs Scrape - Secondary Offerings


 / Published in: C++
 

This scrapes secondary public offering (SPO) pricings and filings from nasdaq.com, one month at a time, and writes them to a CSV file.

  #include <string>
  #include <sstream>
  #include <iostream>
  #include <fstream>
  #include <vector>
  #include <algorithm>
  #include <stdexcept>
  #include <stdlib.h>
  #include <unistd.h>

  #include <curlpp/cURLpp.hpp>
  #include <curlpp/Easy.hpp>
  #include <curlpp/Options.hpp>
  #include <htmlcxx/html/ParserDom.h>

  using namespace std;
  using namespace htmlcxx;
  16.  
  17. int main() {
  18. vector<string> tabStrings;
  19. tabStrings.push_back( "pricings" );
  20. tabStrings.push_back( "filings" );
  21.  
  22. vector<string> monthStrings;
  23. for ( int i=1; i<=12; i++ ) {
  24. stringstream out;
  25. out << i;
  26. if ( i<10 ) {
  27. monthStrings.push_back("2011-0"+out.str());
  28. }
  29. else {
  30. monthStrings.push_back("2011-"+out.str());
  31. }
  32. }
  33. monthStrings.push_back("2012-01");
  34.  
  35. fstream outFile( "/tmp/NasdaqSPOs.csv", fstream::out );
  36.  
  37. for ( vector<string>::iterator iter=tabStrings.begin(); iter!=tabStrings.end(); iter++ ) {
  38. for ( vector<string>::iterator monthIter=monthStrings.begin(); monthIter!=monthStrings.end(); monthIter++ ) {
  39. cout << "Scraping " << *iter << " for month = " << *monthIter << endl;
  40. try {
  41. curlpp::Easy myRequest;
  42. myRequest.setOpt(curlpp::options::Url((std::string( "http://www.nasdaq.com/markets/spos/activity.aspx?tab="+*iter+"&month="+*monthIter))));
  43. ostringstream os;
  44. os << myRequest;
  45. string content = os.str();
  46.  
  47. HTML::ParserDom parser;
  48.  
  49. if ( *iter == "pricings" ) {
  50. string type = "PRICING";
  51.  
  52. int startIdx = content.find("<div class=\"genTable\">");
  53. int endIdx = content.find("<!-- end tabpane");
  54.  
  55. if ( startIdx > 0 && endIdx > startIdx ) {
  56. string htmlContent = content.substr(startIdx,(endIdx-startIdx));
  57. tree<HTML::Node> dom = parser.parseTree( htmlContent );
  58.  
  59. for ( tree<HTML::Node>::iterator treeIter=dom.begin(); treeIter!=dom.end(); treeIter++ ) {
  60. if ( treeIter->tagName() == "tr" ) {
  61. string trHtml = treeIter->content( htmlContent );
  62. tree<HTML::Node> trDom = parser.parseTree( trHtml );
  63. int tdCount = 0;
  64. string name = "";
  65. string ticker = "";
  66. string market = "";
  67. string price = "";
  68. string shares = "";
  69. string offerAmount = "";
  70. string datePriced = "";
  71.  
  72. for ( tree<HTML::Node>::iterator trIter=trDom.begin(); trIter!=trDom.end(); trIter++ ) {
  73. if ( trIter->tagName() == "td" ) {
  74. if ( tdCount == 0 ) {
  75. string tdHtml = trIter->content( trHtml );
  76. int startIndex = tdHtml.find("\">");
  77. int endIndex = tdHtml.find("</a>");
  78. name = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
  79. }
  80. else if ( tdCount == 1 ) {
  81. string tdHtml = trIter->content( trHtml );
  82. int startIndex = tdHtml.find("\">");
  83. int endIndex = tdHtml.find("</a>");
  84. ticker = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
  85. }
  86. else if ( tdCount == 2 ) {
  87. string tdHtml = trIter->content( trHtml );
  88. int startIndex = tdHtml.find("\">");
  89. int endIndex = tdHtml.find("</a>");
  90. market = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
  91. }
  92. else if ( tdCount == 3 ) {
  93. string tdHtml = trIter->content( trHtml );
  94. price = tdHtml;
  95. }
  96. else if ( tdCount == 4 ) {
  97. string tdHtml = trIter->content( trHtml );
  98. shares = tdHtml;
  99. }
  100. else if ( tdCount == 5 ) {
  101. string tdHtml = trIter->content( trHtml );
  102. offerAmount = tdHtml;
  103. }
  104. else if ( tdCount == 6 ) {
  105. string tdHtml = trIter->content( trHtml );
  106. datePriced = tdHtml;
  107. }
  108. tdCount++;
  109. }
  110. }
  111. if ( name != "" ) {
  112. outFile << "\"" + type + "\",\"" + name + "\",\"" + ticker + "\",\"" + market + "\",\"" + price + "\",\"" + shares + "\",\"" + offerAmount + "\",\"" + datePriced + "\"" << endl;
  113. }
  114. }
  115. }
  116. }
  117. }
  118. else if ( *iter == "filings" ) {
  119. string type = "FILING";
  120. int startIdx = content.find("<div class=\"genTable\">");
  121. int endIdx = content.find("<!-- end tabpane");
  122.  
  123. if ( startIdx > 0 && endIdx > startIdx ) {
  124. string htmlContent = content.substr(startIdx,(endIdx-startIdx));
  125. tree<HTML::Node> dom = parser.parseTree( htmlContent );
  126.  
  127. for ( tree<HTML::Node>::iterator treeIter=dom.begin(); treeIter!=dom.end(); treeIter++ ) {
  128. if ( treeIter->tagName() == "tr" ) {
  129. string trHtml = treeIter->content( htmlContent );
  130. tree<HTML::Node> trDom = parser.parseTree( trHtml );
  131. int tdCount = 0;
  132. string name = "";
  133. string ticker = "";
  134. string offerAmount = "";
  135. string dateFiled = "";
  136. string market = "";
  137. string price = "";
  138. string shares = "";
  139.  
  140. for ( tree<HTML::Node>::iterator trIter=trDom.begin(); trIter!=trDom.end(); trIter++ ) {
  141. if ( trIter->tagName() == "td" ) {
  142. if ( tdCount == 0 ) {
  143. string tdHtml = trIter->content( trHtml );
  144. int startIndex = tdHtml.find("\">");
  145. int endIndex = tdHtml.find("</a>");
  146. name = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
  147. }
  148. else if ( tdCount == 1 ) {
  149. string tdHtml = trIter->content( trHtml );
  150. ticker = tdHtml;
  151. try {
  152. if ( ticker.find("</a>") != string::npos ) {
  153. int startIndex = ticker.find("\">");
  154. int endIndex = ticker.find("</a>");
  155. ticker = ticker.substr(startIndex+2,(endIndex-startIndex-2));
  156. }
  157. } catch ( std::out_of_range &e ) {}
  158. }
  159. else if ( tdCount == 2 ) {
  160. string tdHtml = trIter->content( trHtml );
  161. offerAmount = tdHtml;
  162. }
  163. else if ( tdCount == 3 ) {
  164. string tdHtml = trIter->content( trHtml );
  165. dateFiled = tdHtml;
  166. }
  167. tdCount++;
  168. }
  169. }
  170. if ( name != "" ) {
  171. outFile << "\"" + type + "\",\"" + name + "\",\"" + ticker + "\",\"" + market + "\",\"" + price + "\",\"" + shares + "\",\"" + offerAmount + "\",\"" + dateFiled + "\"" << endl;
  172. }
  173. }
  174. }
  175. }
  176.  
  177. }
  178. } catch( curlpp::RuntimeError &e ) { std::cout << e.what() << std::endl; }
  179. catch( curlpp::LogicError &e ) { std::cout << e.what() << std::endl; }
  180. catch( std::out_of_range &e ) { cout << e.what() << endl; }
  181. usleep( 125000 );
  182. }
  183. }
  184. outFile.close();
  185. return 0;
  186. }

Report this snippet  

You need to login to post a comment.