Posted By

mandric on 04/26/08


Tagged

Bash yahoo finance Business scraper


Versions (?)

Scrape Yahoo Finance


 / Published in: Bash
 

Not sure why I'm posting this madness. Bash can be good for some things, though. I think part of the reason I used Bash was that I was calling a lot of external commands.

  1. #!/bin/sh
  #
  # scrape_yahoo.sh
  #
  # Pulls quote data from Yahoo using the NYSE index symbol list generated by
  # scrape_nyse.sh. It iterates through the list, saving the data for each
  # security in one big log and in separate per-security log files. This stuff
  # should go into a database, sooner or later.

  # debug switch consulted before the verbose log_normal calls; may be preset
  # in the environment
  DEBUG=${DEBUG:-0}
  PATH=$PATH:/usr/local/bin
  BASEDIR=/home/stockh/stockharmony.com/api/scripts/
  SYMBOLS_FILE=${BASEDIR}../data/nyse_index_symbols.txt
  LAST_SYMBOL=$(tail -n 1 "$SYMBOLS_FILE")
  # the SYMBOL string used in a request to yahoo
  SYMBOLS=''
  # took out "6t" (the URL)
  ARG_STRING=$(tr -d '\n' < "${BASEDIR}../data/yahoo_arg_string_custom.txt")
  YAHOO_URL='http://finance.yahoo.com/d/quotes.csv?s='
  COUNT=0
  # how many symbols should we query in one request?
  GET_SYMBOLS=15
  # seconds to wait between two requests
  SLEEP_TIME=60
  # command substitution already drops the trailing newline of date's output
  BEGIN_TIME=$(date +%Y%m%d%H%M%S)
  DATA_FILE=${BASEDIR}../data/yahoo-finance-${BEGIN_TIME}.csv
  # scratch file for the most recent response; it has a fixed name, so it
  # lives in the private data directory rather than in world-writable /tmp
  TMP_FILE=${BASEDIR}../data/yahoo-data.tmp
  LOGFILE=${BASEDIR}../logs/scraping
  SECS_DIR=${BASEDIR}../data/securities/
  GET_YAHOO=0
  SELF=${0##*/}
  PIDFILE=${BASEDIR}../logs/${SELF}.pid
  ERROR_STRING='default error string'
  # becomes non-empty once an SMS alert has gone out
  SMS_SENT=''
  33.  
  34. # define functions first, put in include file later
  35.  
  # Send an SMS alert through an e-mail-to-SMS address, at most once per run.
  #   $1 - message text (optional). Without it the last $STRING is sent, and
  #        if that is empty too, $ERROR_STRING.
  # Globals: STRING (read/written), SMS_SENT (read/written),
  #          ERROR_STRING (read), SMS_ADDRESS (read; optional recipient
  #          override, defaults to the original hard-coded address)
  # Returns: 0
  send_sms_msg () {
    if [ -n "$1" ]; then STRING=$1; fi
    # one text per run is enough; later failures must not flood the phone
    if [ -n "$SMS_SENT" ]; then
      return 0
    fi
    # printf with a quoted argument keeps the message verbatim: no whitespace
    # collapsing, no glob expansion, no surprises from a leading "-n"
    printf '%s\n' "${STRING:-$ERROR_STRING}" |
      mailx -s 'stockh error' "${SMS_ADDRESS:-1234567789@cingularme.com}"
    SMS_SENT=1
  }
  46.  
  # Append a timestamped error entry to $LOGFILE.
  #   $1 - reason for the failure (optional)
  # The return code written to the log is the exit status of whatever ran
  # immediately before this function was called.
  # Globals: LOGFILE (read), STRING (written)
  log_error () {
    # must be the very first command: any test below would overwrite $?
    log_error_rc=$?
    if [ -n "$1" ]; then
      STRING="OOPS: return code $log_error_rc because $1"
    else
      STRING="OOPS: return code $log_error_rc"
    fi
    date >> "$LOGFILE"
    printf '%s\n' "$STRING" >> "$LOGFILE"
  }
  56.  
  # Append a timestamped informational entry to $LOGFILE.
  #   $1 - message (optional). Without it, the exit status of the command that
  #        ran immediately before the call is logged instead.
  # Globals: LOGFILE (read), STRING (written)
  log_normal () {
    # must be the very first command: any test below would overwrite $?
    log_normal_rc=$?
    if [ -n "$1" ]; then
      STRING="OK: $1"
    else
      STRING="OK: seems ok $log_normal_rc"
    fi
    date >> "$LOGFILE"
    # quoted so that URLs (which contain '?') are never glob-expanded
    printf '%s\n' "$STRING" >> "$LOGFILE"
  }
  66.  
  # Store the response held in $TMP_FILE: once, verbatim, in the main data
  # file, and once per symbol in that symbol's own CSV file, where every saved
  # line is prefixed with the fetch timestamp.
  # Globals: TMP_FILE, DATA_FILE, LOGFILE, SECS_DIR, GOT_WHEN (all read),
  #          SYMBOLS (read; space separated list of symbols)
  save_symbol_data () {
    # save once in main file
    cat "$TMP_FILE" >> "$DATA_FILE" 2>> "$LOGFILE"
    # $SYMBOLS is split on purpose: it is a space separated list
    for j in $SYMBOLS; do
      # -F: the symbol is literal text, so "BRK.A" does not match "BRKXA".
      # Each matching line gets the timestamp, and a symbol with no match
      # adds nothing, so no dangling "timestamp," is left in the file.
      grep -i -F -- ",\"$j\"," "$TMP_FILE" 2>> "$LOGFILE" |
        while IFS= read -r line; do
          printf '%s,%s\n' "$GOT_WHEN" "$line"
        done >> "${SECS_DIR}${j}.csv"
    done
  }
  77.  
  # Look for "Ticker symbol has changed to:" notices in $TMP_FILE. When there
  # are any, add the new symbols to $SYMBOLS, query them as well and append the
  # response to $TMP_FILE.
  # Globals: TMP_FILE, LOGFILE, YAHOO_URL, ARG_STRING, DEBUG (read),
  #          SYMBOLS (appended to), NEW, TMP, URL (written)
  fetch_changed_symbol_data () {
    # Only lines carrying the notice are printed (-n ... p), so a notice in an
    # unexpected format is skipped instead of being taken for a symbol.
    # [^"]* stops at the end of the first href, even with more links on the
    # line.
    NEW=$(sed -n '/"Ticker symbol has changed to:/s/.*changed to: <a href="\/q?s=\([^"]*\)">.*/\1/p' \
      "$TMP_FILE" 2>> "$LOGFILE" | tr '\n' ' ')
    NEW=${NEW% }
    if [ -z "$NEW" ]; then
      return 0
    fi
    log_normal "got changed symbols: $NEW"
    # save new syms to global var $SYMBOLS
    SYMBOLS="${SYMBOLS} ${NEW}"
    TMP=$(printf '%s' "$NEW" | tr ' ' '+')
    URL="${YAHOO_URL}${TMP}&f=$ARG_STRING"
    # append data to tmp file; testing lynx directly keeps its exit status in
    # $? for log_error
    if lynx -dump "$URL" >> "$TMP_FILE" 2>> "$LOGFILE"; then
      # DEBUG=0 means off ("[ $DEBUG ]" would be true for any non-empty value)
      if [ "${DEBUG:-0}" != 0 ]; then log_normal "$URL"; fi
    else
      log_error "lynx failed getting changed symbols $URL"
      send_sms_msg "lynx failed getting $NEW"
    fi
  }
  103.  
  # Take the space separated list of symbols in $SYMBOLS, query yahoo for them
  # in a single request and save the response.
  # On success $TMP_FILE holds the new response, $GOT_WHEN the fetch time, and
  # fetch_changed_symbol_data and save_symbol_data are run; on failure the
  # error is logged and an SMS alert is sent.
  # Globals: SYMBOLS, YAHOO_URL, ARG_STRING, TMP_FILE, LOGFILE, DEBUG (read),
  #          TMP, URL, GOT_WHEN (written)
  fetch_and_save_symbol_data () {
    # replace space with + for URL
    TMP=$(printf '%s' "$SYMBOLS" | tr ' ' '+')
    URL="${YAHOO_URL}${TMP}&f=$ARG_STRING"
    # clobber TMP_FILE with new data; testing lynx directly keeps its exit
    # status in $? for log_error
    if lynx -dump "$URL" > "$TMP_FILE" 2>> "$LOGFILE"; then
      # DEBUG=0 means off ("[ $DEBUG ]" would be true for any non-empty value)
      if [ "${DEBUG:-0}" != 0 ]; then log_normal "$URL"; fi
      GOT_WHEN=$(date +%Y%m%d%H%M%S)
      fetch_changed_symbol_data
      # regardless of fetch_changed_symbol_data always save symbol data at
      # this point
      save_symbol_data
    else
      log_error "lynx failed getting $URL"
      send_sms_msg "lynx failed on $TMP"
    fi
  }
  124.  
  # ---- main flow ----------------------------------------------------------
  # Only one run at a time: an existing PID file means that another run is
  # active, or that an earlier one died without cleaning up.
  if [ -f "$PIDFILE" ]; then
    send_sms_msg "$SELF exiting, PID exists"
    log_error "$SELF exiting, PID exists"
    exit 1
  fi

  if [ ! -r "$SYMBOLS_FILE" ]; then
    log_error "$SELF exiting, cannot read $SYMBOLS_FILE"
    send_sms_msg "$SELF exiting, cannot read symbols file"
    exit 1
  fi

  echo $$ > "$PIDFILE" 2>> "$LOGFILE"
  # an interrupted run must not leave a PID file that blocks every later run
  trap 'rm -f "$PIDFILE"; exit 1' HUP INT TERM

  # Query yahoo for the symbols collected in $SYMBOLS, pause, then start a new
  # batch.
  fetch_batch () {
    # DEBUG=0 means off ("[ $DEBUG ]" would be true for any non-empty value)
    if [ "${DEBUG:-0}" != 0 ]; then log_normal "SYMBOLS are $SYMBOLS"; fi
    fetch_and_save_symbol_data
    sleep "$SLEEP_TIME"
    SYMBOLS=''
    COUNT=0
  }

  # Collect $GET_SYMBOLS symbols at a time in $SYMBOLS, then fetch the batch.
  # The list is read on descriptor 3 so that commands run inside the loop
  # cannot consume it through stdin; "|| [ -n "$i" ]" keeps a last line that
  # has no trailing newline.
  while read -r i <&3 || [ -n "$i" ]; do
    # blank lines carry no symbol
    [ -n "$i" ] || continue
    COUNT=$((COUNT + 1))
    SYMBOLS="${SYMBOLS:+$SYMBOLS }$i"
    if [ "$COUNT" -ge "$GET_SYMBOLS" ]; then
      fetch_batch
    fi
  done 3< "$SYMBOLS_FILE"

  # The final, partial batch. Flushing here instead of comparing each symbol
  # with the last line of the file also works when the file ends in a blank
  # line or lists its last symbol twice.
  if [ -n "$SYMBOLS" ]; then
    fetch_batch
  fi

  printf '%s\n' "$SELF started at $BEGIN_TIME" >> "$LOGFILE" 2>&1
  printf '%s\n' "$SELF finished on $(date)" >> "$LOGFILE" 2>&1
  wc -l "$DATA_FILE" >> "$LOGFILE" 2>&1
  wc -l "$SYMBOLS_FILE" >> "$LOGFILE" 2>&1
  rm -f "$PIDFILE" >> "$LOGFILE" 2>&1
  # keep rm's status: the test below would replace $? with its own result
  RM_STATUS=$?

  if [ "$RM_STATUS" -eq 0 ]; then
    exit 0
  else
    send_sms_msg "could not remove PIDFILE. rm returned $RM_STATUS"
    exit 1
  fi

Report this snippet  

You need to login to post a comment.